Commit e847cf8a authored by bdf's avatar bdf Committed by Zaida Zhou
Browse files

[Refactor] Adapt mlu code to cntoolkit3.0.1

parent 4c6e99c8
......@@ -88,14 +88,14 @@ __mlu_func__ void bboxOverlapsWorkflow(
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_const(vec_left, vec_left, (T)offset, batches_stride);
__bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_const(vec_right, vec_right, (T)offset, batches_stride);
__bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__nramset(vec_bottom, batches_stride, 0.f);
__bang_write_value(vec_bottom, batches_stride, 0.f);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
......@@ -107,11 +107,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_const(vec_top, vec_top, (T)offset, batches_stride);
__bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride);
__bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
......@@ -121,11 +121,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
__bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
__bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
......@@ -137,7 +137,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
T *inter_s = height;
// offset vector ---> vec_b2_y1
__nramset(vec_b2_y1, batches_stride, T(offset));
__bang_write_value(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
......@@ -164,10 +164,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
int32_t base1 = b1 * COORD_NUM;
// set bbox1 and bbox2 to nram
__nramset(vec_b1_x1, batches_stride, bbox1[base1]);
__nramset(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
__nramset(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
__nramset(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
__bang_write_value(vec_b1_x1, batches_stride, bbox1[base1]);
__bang_write_value(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
__bang_write_value(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
__bang_write_value(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
for (int32_t j = 0; j < num_loop_cpy; j++) {
int32_t index2 = j * batches_stride;
......@@ -195,13 +195,13 @@ __mlu_func__ void bboxOverlapsWorkflow(
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_const(vec_left, vec_left, (T)offset, batches_stride);
__bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_const(vec_right, vec_right, (T)offset, batches_stride);
__bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__nramset(vec_bottom, batches_stride, (T)0);
__bang_write_value(vec_bottom, batches_stride, (T)0);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
......@@ -213,10 +213,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_const(vec_top, vec_top, (T)offset, batches_stride);
__bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride);
__bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
......@@ -225,10 +225,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
__bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
__bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
......@@ -239,7 +239,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
T *inter_s = height;
// offset vector ---> vec_b2_y1
__nramset(vec_b2_y1, batches_stride, T(offset));
__bang_write_value(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
......
......@@ -139,7 +139,7 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1;
// set output_nram to zero
__nramset(output_nram, param.output_nram_size, T(0));
__bang_write_value(output_nram, param.output_nram_size, T(0));
// loop blocks of kernel window: grid_dim.(Kh, Kw)
for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) {
......@@ -313,8 +313,8 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
T *sum = sum_array;
for (int g = 0; g < blkSize.G; ++g) {
__bang_mul_const(sum, src, mask_array[mask_index],
param.block_Cg_NFU);
__bang_mul_scalar(sum, src, mask_array[mask_index],
param.block_Cg_NFU);
//
// NOTE: Since block_Cg_NFU >= block_Cg_stride,
// overlapped writing may occur on sum_array.
......@@ -446,8 +446,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
T *base_grad_input = (T *)grad_input + input_index;
__memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T),
GDRAM2NRAM);
__bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], num_align);
__bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], num_align);
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
(T *)grad_input_buff, num_align);
__bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
......@@ -485,8 +485,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
T *base_grad_input = (T *)grad_input + input_index;
__memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T),
GDRAM2NRAM);
__bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], rem_for_loop_align);
__bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], rem_for_loop_align);
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
(T *)grad_input_buff, rem_for_loop);
__bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
......@@ -541,12 +541,12 @@ void KernelCarafeBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
const int wi, const int c, const int k_up,
const int group, const int scale) {
if (dtype == CNRT_FLOAT16) {
backward::MLUUnion1KernelCarafeBackward<half>
<<<k_dim, k_type, queue>>>(input, mask, grad_output, grad_input,
grad_mask, n, hi, wi, c, k_up, group, scale);
backward::MLUUnion1KernelCarafeBackward<half><<<k_dim, k_type, queue>>>(
input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
group, scale);
} else {
backward::MLUUnion1KernelCarafeBackward<float>
<<<k_dim, k_type, queue>>>(input, mask, grad_output, grad_input,
grad_mask, n, hi, wi, c, k_up, group, scale);
backward::MLUUnion1KernelCarafeBackward<float><<<k_dim, k_type, queue>>>(
input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
group, scale);
}
}
......@@ -211,51 +211,52 @@ __mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src,
// get sign bit
const float move_23bit = 8388608.0;
// 0x80000000 = 1,00000000,00000000000000000000000 (sign, exponent, mantissa)
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x80000000);
__bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x80000000);
__bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition,
src_count * sizeof(float), NFU_ALIGN_SIZE);
// get 1 or 0 from sign bit
// judg is Odd
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x00000001);
__bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x00000001);
__bang_cycle_bor((char *)dst_addition, (char *)dst_addition,
(char *)src_addition, src_count * sizeof(float),
NFU_ALIGN_SIZE);
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x80000001);
__bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x80000001);
__bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float));
// minus xor, positive num invariant
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0xffffffff);
__bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0xffffffff);
__bang_cycle_mul(dst, dst_addition, src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float));
__bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float));
// convert int32 to float32
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x7fffff);
__bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x7fffff);
__bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition,
src_count * sizeof(float), NFU_ALIGN_SIZE);
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x4b000000);
__bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x4b000000);
__bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition,
src_count * sizeof(float), NFU_ALIGN_SIZE);
__bang_sub_const(dst, dst, move_23bit, src_count);
__bang_sub_scalar(dst, dst, move_23bit, src_count);
// add one
__bang_add(dst, dst, dst_addition, src_count);
// set sign for float32
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0xffffffff);
__bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0xffffffff);
__bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float));
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x00000001);
__bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x00000001);
__bang_cycle_add(dst_addition, dst_addition, src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float));
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x80000000);
__bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x80000000);
__bang_cycle_band((char *)dst_addition, (char *)dst_addition,
(char *)src_addition, src_count * 4, 128);
__bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4);
......@@ -291,18 +292,20 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
// dst_addition = abs(src)
__bang_mul(dst_addition, src, (float *)dst, src_count);
// if dst_addition < 1.0 , then src_addition + 1, to fix add error.
__nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 1.0f);
__bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
1.0f);
__bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float));
__bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count);
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0xbf800000);
__bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0xbf800000);
// set negative flag -1.0 = 0xbf800000
__bang_cycle_eq(
(float *)dst, (float *)dst, (float *)src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float)); // to mark all src in [x<-1.0]
__bang_active_abs(dst_addition, src, src_count);
__nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 8388608.0f);
__bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
8388608.0f);
// mask shift move 23
__bang_cycle_add_tz(
dst_addition, dst_addition, src_addition, src_count,
......@@ -314,12 +317,12 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
// to fix max value
// 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0x4b7fffff <=> 16777215.0,
// means max value.
__bang_mul_const((float *)dst, (float *)dst, 16777215.0, src_count);
__bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count);
__bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst,
src_count * floatDchar);
// get low 23bit
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
(unsigned)0x007fffff);
__bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
(unsigned)0x007fffff);
// mask low 23bit is 1
__bang_cycle_band((char *)dst_addition, (char *)dst_addition,
(char *)src_addition, src_count * floatDchar,
......@@ -327,16 +330,36 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
// set 9 high bit ===> dst
// -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000
// 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000
__nramset(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);
__bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);
__bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float));
// src or dst_addition
__bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition,
src_count * floatDchar);
__bang_mul_const((float *)dst, (float *)dst, -2.0, src_count);
__bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count);
__bang_bor((char *)dst, (char *)dst, (char *)dst_addition,
src_count * floatDchar);
#endif // __BANG_ARCH__ >= 300
}
/*!
 * @brief Narrows a buffer of float32 values to half precision.
 *
 * The rounding intrinsic is chosen at compile time from __BANG_ARCH__:
 * round-down (rd) on MLU200, i.e. below 300, and round-to-nearest (rn)
 * on MLU300, i.e. 300 and above.
 *
 * @param[out] dst
 *   Pointer to NRAM that receives the half type data.
 * @param[in] src
 *   Pointer to NRAM that holds the float32 type data.
 * @param[in] src_count
 *   Number of elements in src to convert.
 */
__mlu_func__ inline void convertFloat2half(half *dst, float *src,
                                           int src_count) {
#if __BANG_ARCH__ < 300
  // Pre-300 architectures: round-down conversion.
  __bang_float2half_rd(dst, src, src_count);
#else
  // 300 and newer: round-to-nearest conversion.
  __bang_float2half_rn(dst, src, src_count);
#endif
}
#endif // COMMON_MLU_HELPER_HPP_
......@@ -9,14 +9,9 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include "nms_utils.hpp"
// Element granularity used with PAD_UP / PAD_DOWN when sizing NMS segments.
#define NMS_SIZE (64)
// Number of coordinates per box (x1, y1, x2, y2).
#define COORD_DIM (4)
// coreId value that pvLock() / pvUnlock() exclude from locking.
#define MEMORY_CORE (0x80)
#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score
#define REDUCE_NUM \
(7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
// Size in bytes of the nram_buffer array declared in this file.
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024)
// Size in bytes of the sram_buffer array declared in this file.
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
......@@ -24,348 +19,129 @@
// Byte buffer declared with the __nram__ qualifier; nms_detection lays out
// its working arrays (score, coordinates, intersections, ...) inside it.
__nram__ int8_t nram_buffer[SIZE_NRAM_BUF];
// Byte buffer declared with the __mlu_shared__ qualifier.
// NOTE(review): presumably passed to nms_detection as its `sram` argument —
// confirm against the kernel entry points, which are not visible here.
__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF];
// Acquires lock (0, 0) through __bang_lock when compiled for
// __BANG_ARCH__ == 270; the core whose coreId equals MEMORY_CORE never
// takes the lock. Compiles to an empty function on every other architecture.
__mlu_func__ void pvLock() {
#if __BANG_ARCH__ == 270
  if (coreId == MEMORY_CORE) {
    return;
  }
  __bang_lock(0, 0);
#endif
}
// Releases lock (0, 0) through __bang_unlock when compiled for
// __BANG_ARCH__ == 270; the core whose coreId equals MEMORY_CORE is
// skipped. Compiles to an empty function on every other architecture.
__mlu_func__ void pvUnlock() {
#if __BANG_ARCH__ == 270
  if (coreId == MEMORY_CORE) {
    return;
  }
  __bang_unlock(0, 0);
#endif
}
// Selects which memory a buffer lives in; nms_detection uses it to pick the
// __memcpy direction (SRAM2NRAM / NRAM2SRAM vs. GDRAM2NRAM / NRAM2GDRAM).
enum Addr { SRAM, GDRAM };
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection(
uint32_t *output_box_num, const int output_mode, const int input_layout,
OUT_DT *output_data, const Addr dst, IN_DT *input_data_score,
const IN_DT *input_data_box, const Addr src, IN_DT *buffer,
const int buffer_size, IN_DT *sram, const int core_limit,
const int input_box_num, const int input_stride, const int output_stride,
const int keepNum, const float thresh_iou, const float thresh_score,
uint32_t &output_box_num, const int output_mode, OUT_DT *output_dram,
IN_DT *input_data_score, const IN_DT *input_data_box, const Addr input_ram,
IN_DT *sram, const int core_limit, const int input_num_boxes,
const int max_output_size, const float thresh_iou, const float thresh_score,
const float offset, const int algo) {
// global value, it is stored in sram with an offset from the beginning.
const int flag_offset_size = 28;
int32_t *loop_end_flag = (int32_t *)(sram + flag_offset_size);
loop_end_flag[0] = 0;
// global value
int32_t *exit_flag = (int32_t *)(sram + 28);
exit_flag[0] = 0;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
const int nms_buffer_count1 = 9;
int nms_buffer_count1 = 9;
// temp nram buffer to store selected target.
const int nram_save_limit_count = 256;
int nram_save_limit_count = 256;
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr
IN_DT *input_score_ptr;
const IN_DT *input_x1_ptr;
const IN_DT *input_y1_ptr;
const IN_DT *input_x2_ptr;
const IN_DT *input_y2_ptr;
input_score_ptr = input_data_score;
input_x1_ptr = input_data_box;
if (input_layout == 0) {
// [boxes_num, 4]
input_y1_ptr = input_x1_ptr + 1;
input_x2_ptr = input_x1_ptr + 2;
input_y2_ptr = input_x1_ptr + 3;
} else if (input_layout == 1) {
// [4, boxes_num]
input_y1_ptr = input_x1_ptr + input_stride;
input_x2_ptr = input_y1_ptr + input_stride;
input_y2_ptr = input_x2_ptr + input_stride;
}
// nram data ptr
IN_DT *x1;
IN_DT *y1;
IN_DT *x2;
IN_DT *y2;
IN_DT *score;
IN_DT *inter_x1;
IN_DT *inter_y1;
IN_DT *inter_x2;
IN_DT *inter_y2;
IN_DT *max_box; // the max score, x1, y1, x2, y2
IN_DT *x1_mask;
IN_DT *y1_mask;
IN_DT *x2_mask;
IN_DT *y2_mask;
OUT_DT *nram_save;
const IN_DT *input_x1_ptr = input_data_box;
const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;
int limit = 0; // find limit when GDRAM or SRAM
int len_core = 0; // the length deal by every core
int max_seg_pad = 0; // the max length every repeat
int repeat = 0;
int remain = 0;
int remain_pad = 0;
int input_offset = 0; // offset of input_data for current core
int nram_save_count = 0;
// mask for collect x1, y1, x2, y2. each mask has 128 elements
const int mask_size = 128;
const int total_mask_size = 512;
if (output_mode == 0) {
limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT) -
total_mask_size * sizeof(IN_DT)) /
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
} else {
limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * INFO_NUM * sizeof(OUT_DT) -
total_mask_size * sizeof(IN_DT)) /
// 5 means: score, x1, y1, x2, y2
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * 5 * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
}
if (core_limit == 1) {
len_core = input_box_num;
input_offset = 0;
} else {
int avg_core = input_box_num / core_limit;
int rem = input_box_num % core_limit;
len_core = avg_core + (taskId < rem ? 1 : 0);
input_offset = avg_core * taskId + (taskId <= rem ? taskId : rem);
}
max_seg_pad = PAD_DOWN(limit, NMS_SIZE);
repeat = len_core / max_seg_pad;
remain = len_core % max_seg_pad;
remain_pad = PAD_UP(remain, NMS_SIZE);
int max_seg_iou_compute = 0;
int repeat_iou_compute = 0;
int remain_iou_compute = 0;
int remain_pad_iou_compute = 0;
// if datatype is half, we should convert it to float when compute the IoU
int max_seg_iou_compute =
PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE);
int repeat_iou_compute = len_core / max_seg_iou_compute;
int remain_iou_compute = len_core % max_seg_iou_compute;
int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE);
// initial the address point
score = buffer;
x1 = score + max_seg_pad;
y1 = x1 + max_seg_pad;
x2 = y1 + max_seg_pad;
y2 = x2 + max_seg_pad;
inter_x1 = y2 + max_seg_pad;
inter_y1 = inter_x1 + max_seg_pad;
inter_x2 = inter_y1 + max_seg_pad;
inter_y2 = inter_x2 + max_seg_pad;
x1_mask = inter_y2 + max_seg_pad;
y1_mask = x1_mask + mask_size;
x2_mask = y1_mask + mask_size;
y2_mask = x2_mask + mask_size;
max_box = y2_mask + mask_size; // the max score, x1, y1, x2, y2
// offset two line from max_box
nram_save = (OUT_DT *)((char *)max_box + NFU_ALIGN_SIZE);
getComputeParamsBlockOrU1(sizeof(IN_DT), input_num_boxes, limit, core_limit,
input_offset, max_seg_pad, repeat, remain,
remain_pad, max_seg_iou_compute, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute);
// set mask for __bang_collect instruction
if (input_layout == 0) {
__nramset((IN_DT *)x1_mask, total_mask_size, (IN_DT)0);
for (int idx = 0; idx < mask_size; idx++) {
int index = (idx % COORD_DIM) * mask_size + idx;
x1_mask[index] = (IN_DT)1.0;
}
}
// init the data ptr
IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *x1 = score + max_seg_pad;
IN_DT *y1 = x1 + max_seg_pad;
IN_DT *x2 = y1 + max_seg_pad;
IN_DT *y2 = x2 + max_seg_pad;
IN_DT *inter_x1 = y2 + max_seg_pad;
IN_DT *inter_y1 = inter_x1 + max_seg_pad;
IN_DT *inter_x2 = inter_y1 + max_seg_pad;
IN_DT *inter_y2 = inter_x2 + max_seg_pad;
IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
NFU_ALIGN_SIZE); // offset two line from max_box
for (int keep = 0; keep < keepNum; keep++) { // loop until the max_score <= 0
#if __BANG_ARCH__ >= 300
float max_box_x1 = 0;
float max_box_y1 = 0;
float max_box_x2 = 0;
float max_box_y2 = 0;
#endif
mluMemcpyDirection_t load_dir = SRAM2NRAM;
mluMemcpyDirection_t store_dir = NRAM2SRAM;
load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= 0
if (core_limit != 1) {
__sync_cluster(); // sync before current loop
}
/******find max start******/
/******FIND MAX START******/
int max_index = 0; // the max score index
int global_max_index = 0; // for U1
float max_area = 0; // the max score area
float max_area = 0; // the max socre area
max_box[0] = 0; // init 0
for (int i = 0; i <= repeat; i++) {
if (i == repeat && remain == 0) {
break;
}
int seg_len = 0; // the length every nms compute
int cpy_len = 0; // the length every nms memcpy
i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad;
// check seg_len exceeds the limit of fp16 or not. 65536 is the largest
// num that half data type could express.
if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) {
// seg length exceeds the max num for fp16 datatype!
return;
}
i == repeat ? cpy_len = remain : cpy_len = max_seg_pad;
/******nms load start******/
mluMemcpyDirection_t load_dir = SRAM2NRAM;
if (src == SRAM) {
load_dir = SRAM2NRAM;
} else {
load_dir = GDRAM2NRAM;
}
__nramset(score, seg_len, (IN_DT)0);
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
/******nms load end******/
__bang_max(inter_x1, score, seg_len);
if (inter_x1[0] > max_box[0]) {
max_box[0] = inter_x1[0];
if (sizeof(IN_DT) == sizeof(half)) {
max_index = ((uint16_t *)inter_x1)[1] + input_offset +
i * max_seg_pad; // offset start from head of input_data
} else if (sizeof(IN_DT) == sizeof(float)) {
max_index = ((uint32_t *)inter_x1)[1] + input_offset +
i * max_seg_pad; // offset start from head of input_data
}
}
} // for repeat
int stride = 1;
if (input_layout == 0) {
stride = input_stride;
} else if (input_layout == 1) {
stride = 1;
}
findCoreMaxBox(input_data_score, score, inter_x1, max_box, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
input_offset, repeat, remain, remain_pad, max_seg_pad,
max_index);
if (core_limit == 1) {
max_box[1] = input_x1_ptr[max_index * stride];
max_box[2] = input_y1_ptr[max_index * stride];
max_box[3] = input_x2_ptr[max_index * stride];
max_box[4] = input_y2_ptr[max_index * stride];
if (algo == 0 || offset == 0.0) {
max_area = ((float)max_box[3] - (float)max_box[1]) *
((float)max_box[4] - (float)max_box[2]);
} else {
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
input_score_ptr[max_index] = 0;
#if __BANG_ARCH__ >= 300
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box_x2, max_box_y2);
#else
calMaxArea(max_box, algo, offset, max_area);
#endif
input_data_score[max_index] = 0;
global_max_index = max_index;
((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
} else if (core_limit == 4) {
// find the max with sram
// the max box's x1, y1, x2, y2 on every core
if (coreId != MEMORY_CORE) {
max_box[1] = input_x1_ptr[max_index * stride];
max_box[2] = input_y1_ptr[max_index * stride];
max_box[3] = input_x2_ptr[max_index * stride];
max_box[4] = input_y2_ptr[max_index * stride];
}
((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
// copy every core's box info to sram, form: score---x1---y1---x2---y2---
for (int i = 0; i < INFO_NUM; i++) {
__memcpy(sram + i * core_limit + taskId, max_box + i, 1 * sizeof(IN_DT),
NRAM2SRAM);
}
// copy every core's max_index to sram, use 2 half to store max_index
__memcpy(sram + INFO_NUM * core_limit + taskId * 2, max_box + INFO_NUM,
sizeof(uint32_t),
NRAM2SRAM); // int32_t datatype
__sync_cluster();
findClusterMaxBox(sram, max_box, inter_x1, input_data_score, core_limit);
// copy score from sram to nram and find the max
__nramset(inter_x1, NMS_SIZE, (IN_DT)0);
__memcpy(inter_x1, sram, core_limit * sizeof(IN_DT), SRAM2NRAM);
__bang_max(max_box, inter_x1, NMS_SIZE);
int max_core = 0;
if (sizeof(IN_DT) == sizeof(half)) {
max_core = ((uint16_t *)max_box)[1];
} else if (sizeof(IN_DT) == sizeof(float)) {
max_core = ((uint32_t *)max_box)[1];
}
// copy the max box from SRAM to NRAM
__memcpy(max_box + 1, sram + 1 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // x1
__memcpy(max_box + 2, sram + 2 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // y1
__memcpy(max_box + 3, sram + 3 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // x2
__memcpy(max_box + 4, sram + 4 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // y2
__memcpy(max_box + 5, sram + 5 * core_limit + 2 * max_core,
sizeof(uint32_t), SRAM2NRAM);
if (algo == 0 || offset == 0.0) {
max_area = ((float)max_box[3] - (float)max_box[1]) *
((float)max_box[4] - (float)max_box[2]);
} else {
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
global_max_index = ((uint32_t *)(max_box + INFO_NUM))[0];
input_score_ptr[global_max_index] = 0;
#if __BANG_ARCH__ >= 300
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box_x2, max_box_y2);
#else
calMaxArea(max_box, algo, offset, max_area);
#endif
global_max_index = ((uint32_t *)(max_box + 5))[0];
input_data_score[global_max_index] = 0;
}
// by now, we get: max_score|max_index|max_box|max_area
/******find max end******/
/******nms store start******/
// store to nram
if (float(max_box[0]) > thresh_score) {
OUT_DT *save_ptr;
int save_offset = 0;
int save_str_num = 0;
save_ptr = nram_save;
save_offset = nram_save_count;
save_str_num = nram_save_limit_count;
if (coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
__memcpy(save_ptr + save_offset, (uint32_t *)(max_box + INFO_NUM),
1 * sizeof(uint32_t), NRAM2NRAM, 1 * sizeof(uint32_t),
1 * sizeof(uint32_t), 0);
} else if (output_mode == 1) { // score, x1, y1, x2, y2
__memcpy(save_ptr + save_offset * INFO_NUM, max_box,
INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
} else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2---
__memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT),
NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT),
4);
}
}
nram_save_count++;
(*output_box_num)++;
}
/******FIND MAX END******/
// store to sram/gdram
if (*output_box_num != 0) {
mluMemcpyDirection_t store_dir = NRAM2GDRAM;
if (dst == SRAM) {
store_dir = NRAM2SRAM;
} else { // dst == GDRAM
store_dir = NRAM2GDRAM;
}
if ((nram_save_count == nram_save_limit_count) ||
(float(max_box[0]) <= thresh_score) || keep == keepNum - 1) {
if (nram_save_count != 0) {
if (coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
pvLock();
__memcpy(output_data, nram_save,
nram_save_count * sizeof(uint32_t), store_dir);
pvUnlock();
output_data += nram_save_count;
} else if (output_mode == 1) { // score, x1, y1, x2, y2
pvLock();
__memcpy(output_data, nram_save,
nram_save_count * INFO_NUM * sizeof(IN_DT), store_dir);
pvUnlock();
output_data += nram_save_count * INFO_NUM;
} else if (output_mode ==
2) { // score---, x1---, y1---, x2---, y2---
pvLock();
__memcpy(output_data, nram_save, nram_save_count * sizeof(IN_DT),
store_dir, output_stride * sizeof(IN_DT),
nram_save_limit_count * sizeof(IN_DT), 4);
pvUnlock();
output_data += nram_save_count;
}
nram_save_count = 0;
}
}
} // if move data nram->sram/gdram
} // if dst
storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
max_output_size, thresh_score, output_mode, nram_save_count,
output_box_num);
// if the max score <= 0, end
if (core_limit == 1) {
......@@ -375,190 +151,40 @@ __mlu_func__ void nms_detection(
} else {
if (float(max_box[0]) <= thresh_score) {
if (coreId == 0) {
loop_end_flag[0] = 1;
exit_flag[0] = 1;
}
}
__sync_cluster();
if (loop_end_flag[0] == 1) {
if (exit_flag[0] == 1) {
break;
}
}
/******nms store end******/
// To solve half data accuracy, we convert half to float to calculate IoU.
for (int i = 0; i <= repeat_iou_compute; i++) {
if (i == repeat_iou_compute && remain_iou_compute == 0) {
break;
}
int seg_len = 0; // the length every nms compute
int cpy_len = 0; // the length every nms memcpy
i == repeat_iou_compute ? seg_len = remain_pad_iou_compute
: seg_len = max_seg_iou_compute;
i == repeat_iou_compute ? cpy_len = remain_iou_compute
: cpy_len = max_seg_iou_compute;
/******nms load start******/
mluMemcpyDirection_t load_dir = SRAM2NRAM;
if (src == SRAM) {
load_dir = SRAM2NRAM;
} else {
load_dir = GDRAM2NRAM;
}
__nramset((float *)score, seg_len, 0.0f);
int dt_offset = 0;
if (sizeof(IN_DT) == sizeof(float)) {
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
dt_offset = 0;
} else if (sizeof(IN_DT) == sizeof(half)) {
__nramset(x1, seg_len, half(0));
__memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__bang_half2float((float *)score, (half *)x1, seg_len);
dt_offset = max_seg_iou_compute;
}
if (input_layout == 0) {
// the following number 4 means x1, y1, x2, y2
__memcpy(
inter_x1,
input_x1_ptr + (input_offset + i * max_seg_iou_compute) * COORD_DIM,
cpy_len * COORD_DIM * sizeof(IN_DT), load_dir,
cpy_len * COORD_DIM * sizeof(IN_DT),
cpy_len * COORD_DIM * sizeof(IN_DT), 0);
// here use collect instruction to transpose the [n, 4] shape into [4,
// n] shape to avoid
// discrete memory accessing.
for (int c_i = 0; c_i < COORD_DIM * seg_len / mask_size; c_i++) {
// the following number 32 means 32 elements will be selected out by
// once operation
__bang_collect(x1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
x1_mask, mask_size);
__bang_collect(y1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
y1_mask, mask_size);
__bang_collect(x2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
x2_mask, mask_size);
__bang_collect(y2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
y2_mask, mask_size);
}
} else if (input_layout == 1) {
__memcpy(x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__memcpy(y1 + dt_offset,
input_y1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__memcpy(x2 + dt_offset,
input_x2_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__memcpy(y2 + dt_offset,
input_y2_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
}
/******nms load end******/
/******nms compute start******/
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
seg_len);
}
// 1、 compute IOU
// get the area_I
__nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1
__bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
seg_len); // inter_x1
__nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
__bang_active_relu((float *)inter_x1, (float *)inter_x1,
seg_len); // inter_w
__nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1
__bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
seg_len); // inter_y1
__nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2
__bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
__bang_active_relu((float *)inter_y1, (float *)inter_y1,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
__bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len);
}
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
// get the area_U: area + max_area - area_I
__bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area),
seg_len);
__bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
seg_len); // area_U
// 2、 select the box
// if IOU greater than thres, set the score to zero, abort it: area_U >
// area_I * (1 / thresh)?
if (thresh_iou > 0.0) {
__bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
__bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******nms compute end******/
// update the score
mluMemcpyDirection_t update_dir = NRAM2SRAM;
if (dst == SRAM) {
update_dir = NRAM2SRAM;
} else {
update_dir = NRAM2GDRAM;
}
if (sizeof(IN_DT) == sizeof(half)) {
__bang_float2half_rd((half *)score, (float *)score, seg_len);
}
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
cpy_len * sizeof(IN_DT), update_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
pvUnlock();
} // for repeat
} // for keepNum
/******NMS STORE END******/
#if __BANG_ARCH__ >= 300
scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box_x1,
max_box_y1, max_box_x2, max_box_y2, nram_save,
repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
input_offset, offset, max_area, input_num_boxes, algo);
#else
scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box[1],
max_box[2], max_box[3], max_box[4], nram_save,
repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
input_offset, offset, max_area, input_num_boxes, algo);
#endif
} // for max_output_size
}
__mlu_global__ void MLUUnion1KernelNMS(
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int input_stride,
const int max_output_size, const float iou_threshold,
const float confidence_threshold, const int mode, const int input_layout,
void *workspace, void *result_num, void *output,
const int input_num_boxes, const int max_output_size,
const float iou_threshold, const float confidence_threshold,
const int output_mode, void *workspace, void *result_num, void *output,
const cnrtDataType_t data_type_input, const float offset, const int algo) {
if (data_type_input == CNRT_FLOAT16) {
__memcpy(workspace, input_confidence, input_num_boxes * sizeof(half),
......@@ -569,82 +195,48 @@ __mlu_global__ void MLUUnion1KernelNMS(
} else {
}
int output_stride = max_output_size;
uint32_t result_box_num = 0;
if (mode == 0) {
uint32_t *out_data = (uint32_t *)output;
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *boxes_data = (half *)input_boxes;
half *confi_data = (half *)workspace;
half *buffer = (half *)nram_buffer;
half *sram = (half *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
case CNRT_FLOAT32: {
float *boxes_data = (float *)input_boxes;
float *confi_data = (float *)workspace;
float *buffer = (float *)nram_buffer;
float *sram = (float *)sram_buffer;
uint32_t output_box_num = 0;
float *score_data = (float *)workspace;
float *boxes_data = (float *)input_boxes;
float *sram = (float *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
if (output_mode == 0) {
if (data_type_input == CNRT_FLOAT32) {
nms_detection(output_box_num, output_mode, (uint32_t *)output, score_data,
boxes_data, GDRAM, sram, taskDim, input_num_boxes,
max_output_size, iou_threshold, confidence_threshold,
offset, algo);
} else {
nms_detection(output_box_num, output_mode, (uint32_t *)output,
(half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
taskDim, input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
}
} else {
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *boxes_data = (half *)input_boxes;
half *confi_data = (half *)workspace;
half *out_data = (half *)output;
half *buffer = (half *)nram_buffer;
half *sram = (half *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
case CNRT_FLOAT32: {
float *boxes_data = (float *)input_boxes;
float *confi_data = (float *)workspace;
float *out_data = (float *)output;
float *buffer = (float *)nram_buffer;
float *sram = (float *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
if (data_type_input == CNRT_FLOAT32) {
nms_detection(output_box_num, output_mode, (float *)output, score_data,
boxes_data, GDRAM, sram, taskDim, input_num_boxes,
max_output_size, iou_threshold, confidence_threshold,
offset, algo);
} else {
nms_detection(output_box_num, output_mode, (half *)output,
(half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
taskDim, input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
}
}
((uint32_t *)result_num)[0] = output_box_num;
}
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection_ux(
int32_t *loop_end_flag, uint32_t &output_box_num, OUT_DT *output_dram,
int32_t *exit_flag, uint32_t &output_box_num, OUT_DT *output_dram,
IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram,
const int input_layout, const int input_num_boxes, const int input_stride,
const int max_output_size, const float thresh_iou, const float thresh_score,
const float offset, const int output_mode, const int algo) {
loop_end_flag[0] = 0;
const int input_num_boxes, const int max_output_size,
const float thresh_iou, const float thresh_score, const float offset,
const int output_mode, const int algo) {
exit_flag[0] = 0;
IN_DT *sram = (IN_DT *)sram_buffer;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
......@@ -654,16 +246,10 @@ __mlu_func__ void nms_detection_ux(
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr
IN_DT *input_score_ptr;
const IN_DT *input_x1_ptr;
const IN_DT *input_y1_ptr;
const IN_DT *input_x2_ptr;
const IN_DT *input_y2_ptr;
input_score_ptr = score_data;
input_x1_ptr = boxes_data;
input_y1_ptr = input_x1_ptr + input_stride;
input_x2_ptr = input_y1_ptr + input_stride;
input_y2_ptr = input_x2_ptr + input_stride;
const IN_DT *input_x1_ptr = boxes_data;
const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;
int limit = 0; // find limit when GDRAM or SRAM
int max_seg_pad = 0; // the max length every repeat
......@@ -682,41 +268,16 @@ __mlu_func__ void nms_detection_ux(
(nms_buffer_count1 * sizeof(IN_DT));
}
// data split
int avg_cluster = input_num_boxes / clusterDim;
int rem_cluster = input_num_boxes % clusterDim;
int len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0);
int cluster_offset = avg_cluster * clusterId +
(clusterId <= rem_cluster ? clusterId : rem_cluster);
int avg_core = len_cluster / coreDim;
int rem_core = len_cluster % coreDim;
int len_core = avg_core + (coreId < rem_core ? 1 : 0);
int core_offset =
avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
int input_offset = cluster_offset + core_offset;
max_seg_pad = PAD_DOWN(limit, NMS_SIZE);
// core 0 of each cluster calculate the max score index
int max_index_avg_core = input_num_boxes / clusterDim;
int max_index_rem_core = input_num_boxes % clusterDim;
int max_index_len_core =
max_index_avg_core + (clusterId < max_index_rem_core ? 1 : 0);
int max_index_input_offset =
max_index_avg_core * clusterId +
(clusterId <= max_index_rem_core ? clusterId : max_index_rem_core);
repeat = max_index_len_core / max_seg_pad;
remain = max_index_len_core % max_seg_pad;
remain_pad = PAD_UP(remain, NMS_SIZE);
// if datatype is fp16, we should cvt to fp32 when compute iou
int max_seg_iou_compute =
PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE);
int repeat_iou_compute = len_core / max_seg_iou_compute;
int remain_iou_compute = len_core % max_seg_iou_compute;
int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE);
int input_offset = 0;
int max_seg_iou_compute = 0;
int repeat_iou_compute = 0;
int remain_iou_compute = 0;
int remain_pad_iou_compute = 0;
getComputeParamsUx(sizeof(IN_DT), input_num_boxes, limit, input_offset,
max_seg_pad, repeat, remain, remain_pad,
max_seg_iou_compute, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute);
// init the nram ptr
IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *x1 = score + max_seg_pad;
......@@ -731,320 +292,94 @@ __mlu_func__ void nms_detection_ux(
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
NFU_ALIGN_SIZE); // offset two line from max_box
mluMemcpyDirection_t input_load_dir = SRAM2NRAM;
mluMemcpyDirection_t input_store_dir = NRAM2SRAM;
input_load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
input_store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
#if __BANG_ARCH__ >= 300
float max_box_x1 = 0;
float max_box_y1 = 0;
float max_box_x2 = 0;
float max_box_y2 = 0;
#endif
mluMemcpyDirection_t load_dir = SRAM2NRAM;
mluMemcpyDirection_t store_dir = NRAM2SRAM;
load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= 0
__sync_all();
/******FIND MAX START******/
int max_index = 0;
int global_max_index = 0; // for Ux
float max_area = 0; // the max socre area
max_box[0] = 0; // init 0
if (coreId == 0) {
for (int i = 0; i <= repeat; i++) {
if (i == repeat && remain == 0) {
break;
}
int seg_len = (i == repeat)
? remain_pad
: max_seg_pad; // the length every nms compute
// check seg_len exceeds the limit of fp16 or not. 65536 is the largest
// num
// that fp16 could express.
if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) {
return;
}
int cpy_len = (i == repeat)
? remain
: max_seg_pad; // the length every nms memcpy
/******NMS LOAD START******/
__bang_write_zero(score, seg_len);
__memcpy(score,
input_score_ptr + max_index_input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
/******NMS LOAD END******/
__bang_max(inter_x1, score, seg_len);
if (inter_x1[0] > max_box[0]) {
max_box[0] = inter_x1[0];
if (sizeof(IN_DT) == sizeof(half)) {
max_index =
((uint16_t *)inter_x1)[1] + max_index_input_offset +
i * max_seg_pad; // offset start from head of input_data
} else if (sizeof(IN_DT) == sizeof(float)) {
max_index =
((uint32_t *)inter_x1)[1] + max_index_input_offset +
i * max_seg_pad; // offset start from head of input_data
}
}
} // for repeat
// the max box's x1, y1, x2, y2 on every cluster
max_box[1] = input_x1_ptr[max_index];
max_box[2] = input_y1_ptr[max_index];
max_box[3] = input_x2_ptr[max_index];
max_box[4] = input_y2_ptr[max_index];
((uint32_t *)(max_box + 5))[0] = max_index;
findCoreMaxBox(score_data, score, inter_x1, max_box, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
input_offset, repeat, remain, remain_pad, max_seg_pad,
max_index);
// copy max box info to sram
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_all();
// copy all partial max to the sram of cluster 0
if (clusterId != 0) {
__memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT),
SRAM2SRAM, 0);
}
__sync_all();
// reduce between clusters to get the global max box
if (clusterId == 0) {
if (coreId == 0) {
__bang_write_zero(inter_x1, NMS_SIZE);
__memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
__bang_max(max_box, inter_x1, NMS_SIZE);
int max_cluster = (sizeof(IN_DT) == sizeof(half))
? ((uint16_t *)max_box)[1]
: ((uint32_t *)max_box)[1];
__memcpy(max_box, sram + max_cluster * REDUCE_NUM,
REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_cluster();
if (coreId == 0x80 && clusterDim > 1) {
// broadcast global max box to each cluster's sram
for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
__memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
cluster_idx);
}
}
__sync_cluster();
}
__sync_all();
#if __BANG_ARCH__ <= 372
findGlobalMaxBox(max_box, sram, inter_x1);
#endif
// copy the global max box to max_box
__memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
if (algo == 0 || offset == 0.0) {
max_area = ((float)max_box[3] - (float)max_box[1]) *
((float)max_box[4] - (float)max_box[2]);
} else {
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
#if __BANG_ARCH__ >= 300
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box_x2, max_box_y2);
#else
calMaxArea(max_box, algo, offset, max_area);
#endif
global_max_index = ((uint32_t *)(max_box + 5))[0];
if (coreId != 0x80) {
input_score_ptr[global_max_index] = 0;
if (coreId != MEMORY_CORE) {
score_data[global_max_index] = 0;
}
// by now, we get: max_score|max_index|max_box|max_area
/******FIND MAX END******/
/******NMS STORE START******/
// store to nram
if (float(max_box[0]) > thresh_score) {
OUT_DT *save_ptr;
int save_offset = 0;
int save_str_num = 0;
save_ptr = nram_save;
save_offset = nram_save_count;
save_str_num = nram_save_limit_count;
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0];
} else if (output_mode == 1) { // score, x1, y1, x2, y2
__memcpy(save_ptr + save_offset * INFO_NUM, max_box,
INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
} else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2---
__memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT),
NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT),
4);
}
}
nram_save_count++;
output_box_num++;
}
// store to sram/gdram
if (output_box_num != 0) {
if ((nram_save_count == nram_save_limit_count) ||
(float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) {
if (nram_save_count != 0) {
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
pvLock();
__memcpy(output_dram, nram_save,
nram_save_count * sizeof(uint32_t), NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count;
} else if (output_mode == 1) { // score, x1, y1, x2, y2
pvLock();
__memcpy(output_dram, nram_save,
nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count * INFO_NUM;
} else if (output_mode ==
2) { // score---, x1---, y1---, x2---, y2---
pvLock();
__memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
NRAM2GDRAM, max_output_size * sizeof(IN_DT),
nram_save_limit_count * sizeof(IN_DT), 4);
pvUnlock();
output_dram += nram_save_count;
}
nram_save_count = 0;
}
}
} // if move data nram->sram/gdram
} // if dst
storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
max_output_size, thresh_score, output_mode, nram_save_count,
output_box_num);
if (float(max_box[0]) <= thresh_score) {
if (clusterId == 0 && coreId == 0) {
loop_end_flag[0] = 1; // dram
exit_flag[0] = 1; // dram
}
}
__sync_all();
if (loop_end_flag[0] == 1) {
if (exit_flag[0] == 1) {
break;
}
/******NMS STORE END******/
// To solve fp16 accuracy, we convert fp16 to fp32 to calculate IoU.
for (int i = 0; i <= repeat_iou_compute; i++) {
if (i == repeat_iou_compute && remain_iou_compute == 0) {
break;
}
int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute
: max_seg_iou_compute;
int cpy_len =
(i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute;
/******NMS LOAD START******/
__nramset((float *)score, seg_len, 0.0f);
int dt_offset = 0;
if (sizeof(IN_DT) == sizeof(float)) {
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
dt_offset = 0;
} else if (sizeof(IN_DT) == sizeof(half)) {
__nramset(x1, seg_len, half(0));
__memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
__bang_half2float((float *)score, (half *)x1, seg_len);
dt_offset = max_seg_iou_compute;
}
__memcpy(x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), input_load_dir,
max_seg_pad * sizeof(IN_DT), input_num_boxes * sizeof(IN_DT), 3);
/******NMS LOAD END******/
/******NMS COMPUTE START******/
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
seg_len);
}
// 1、 compute IOU
// get the area_I
__nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1
__bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
seg_len); // inter_x1
__nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
__bang_active_relu((float *)inter_x1, (float *)inter_x1,
seg_len); // inter_w
__nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1
__bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
seg_len); // inter_y1
__nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2
__bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
__bang_active_relu((float *)inter_y1, (float *)inter_y1,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
__bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len);
}
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
// get the area_U: area + max_area - area_I
__bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area),
seg_len);
__bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
seg_len); // area_U
// 2、 select the box
// if IOU greater than thres, set the score to zero, abort it: area_U >
// area_I * (1 / thresh)?
if (thresh_iou > 0.0) {
__bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
__bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******NMS COMPUTE END******/
if (sizeof(IN_DT) == 2) {
__bang_float2half_rd((half *)score, (float *)score, seg_len);
}
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
cpy_len * sizeof(IN_DT), input_store_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
pvUnlock();
} // for repeat
} // for max_output_size
/******NMS STORE END******/
#if __BANG_ARCH__ >= 300
scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
inter_y1, inter_x2, inter_y2, max_box, max_box_x1, max_box_y1,
max_box_x2, max_box_y2, nram_save, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
max_area, input_num_boxes, algo);
#else
scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
inter_y1, inter_x2, inter_y2, max_box, max_box[1], max_box[2],
max_box[3], max_box[4], nram_save, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
max_area, input_num_boxes, algo);
#endif
} // for max_output_size
}
__mlu_global__ void MLUUionXKernelNMS(
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int input_layout, const int input_stride,
const int max_output_size, const float iou_threshold,
const float confidence_threshold, const float offset,
const cnrtDataType_t data_type_input, const int output_mode, const int algo,
void *workspace, void *result_num, void *output) {
const int input_num_boxes, const int max_output_size,
const float iou_threshold, const float confidence_threshold,
const float offset, const cnrtDataType_t data_type_input,
const int output_mode, const int algo, void *workspace, void *result_num,
void *output) {
int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
int32_t *loop_end_flag =
(int32_t *)((char *)workspace +
INFO_NUM * input_num_boxes * input_dwidth);
int32_t *exit_flag = (int32_t *)((char *)workspace +
INFO_NUM * input_num_boxes * input_dwidth);
int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth;
int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size;
......@@ -1062,88 +397,55 @@ __mlu_global__ void MLUUionXKernelNMS(
__memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM);
}
__sync_cluster();
uint32_t output_box_num = 0;
float *score_data;
float *boxes_data;
score_data = (input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data = (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
if (output_mode == 0) {
uint32_t *output_dram = (uint32_t *)output;
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *score_data;
half *boxes_data;
score_data =
(input_ram == SRAM) ? (half *)sram_score : (half *)workspace;
boxes_data =
(input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
case CNRT_FLOAT32: {
float *score_data;
float *boxes_data;
score_data =
(input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data =
(input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
if (data_type_input == CNRT_FLOAT32) {
nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
score_data, boxes_data, input_ram, input_num_boxes,
max_output_size, iou_threshold, confidence_threshold,
offset, output_mode, algo);
} else {
nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
(half *)score_data, (half *)boxes_data, input_ram,
input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
}
} else {
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *output_dram = (half *)output;
half *score_data;
half *boxes_data;
score_data =
(input_ram == SRAM) ? (half *)sram_score : (half *)workspace;
boxes_data =
(input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
case CNRT_FLOAT32: {
float *output_dram = (float *)output;
float *score_data;
float *boxes_data;
score_data =
(input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data =
(input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
if (data_type_input == CNRT_FLOAT32) {
nms_detection_ux(exit_flag, output_box_num, (float *)output, score_data,
boxes_data, input_ram, input_num_boxes, max_output_size,
iou_threshold, confidence_threshold, offset, output_mode,
algo);
} else {
nms_detection_ux(exit_flag, output_box_num, (half *)output,
(half *)score_data, (half *)boxes_data, input_ram,
input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
}
}
((uint32_t *)result_num)[0] = output_box_num;
}
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_ptr,
const void *scores_ptr, const int input_num_boxes,
const int input_stride, const int max_output_boxes,
const float iou_threshold, const float offset,
void *workspace_ptr, void *output_size_ptr, void *output_ptr) {
const int max_output_boxes, const float iou_threshold,
const float offset, void *workspace_ptr, void *output_size_ptr,
void *output_ptr) {
switch (k_type) {
default: { return; }
case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_UNION1: {
MLUUnion1KernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes, input_stride,
(void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0,
/*output_mode=*/0,
/*input_layout=*/1, workspace_ptr, output_size_ptr, output_ptr,
/*output_mode=*/0, workspace_ptr, output_size_ptr, output_ptr,
data_type_input, offset, /*algo=*/1);
}; break;
case CNRT_FUNC_TYPE_UNION2:
......@@ -1151,11 +453,10 @@ void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
case CNRT_FUNC_TYPE_UNION8:
case CNRT_FUNC_TYPE_UNION16: {
MLUUionXKernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes, /*input_layout=*/1,
input_stride, max_output_boxes, iou_threshold,
/*confidence_threshold=*/0.0, offset, data_type_input,
/*output_mode=*/0, /*algo=*/1, workspace_ptr, output_size_ptr,
output_ptr);
(void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, offset,
data_type_input, /*output_mode=*/0, /*algo=*/1, workspace_ptr,
output_size_ptr, output_ptr);
}; break;
}
}
/*************************************************************************
* Copyright (C) [2019-2022] by Cambricon, Inc.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef NMS_UTILS_HPP_
#define NMS_UTILS_HPP_
#include "common_mlu_helper.hpp"
// Alignment granularity (in elements) used when rounding segment lengths.
#define NMS_SIZE (64)
// Round x up to the nearest multiple of y (integer arithmetic).
// Every argument and the whole expansion are parenthesized so the macro can
// be used with compound arguments and inside larger expressions. Note that x
// and y are still evaluated more than once, so avoid side effects in them.
#define NMS_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
// Round x down to the nearest multiple of y (integer arithmetic).
#define NMS_DOWN(x, y) (((x) / (y)) * (y))
#define INFO_NUM (5)  // 5 means x1, x2, y1, y2 and score
// coreId value that pvLock()/pvUnlock() exclude from locking.
#define MEMORY_CORE (0x80)
#define REDUCE_NUM \
  (7)  // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
// Calls __bang_lock(0, 0) when built for __BANG_ARCH__ == 270, on every core
// whose coreId differs from MEMORY_CORE. Compiles to an empty function for
// any other architecture value.
__mlu_func__ void pvLock() {
#if __BANG_ARCH__ == 270
  const bool is_memory_core = (coreId == MEMORY_CORE);
  if (is_memory_core) {
    return;
  }
  __bang_lock(0, 0);
#endif
}
// Counterpart of pvLock(): calls __bang_unlock(0, 0) when built for
// __BANG_ARCH__ == 270, on every core whose coreId differs from MEMORY_CORE.
// Compiles to an empty function for any other architecture value.
__mlu_func__ void pvUnlock() {
#if __BANG_ARCH__ == 270
  const bool is_memory_core = (coreId == MEMORY_CORE);
  if (is_memory_core) {
    return;
  }
  __bang_unlock(0, 0);
#endif
}
// ReLU with an optional upper bound, applied to deal_num elements.
//   threshold < 0  : returns immediately, nram_dst is not written.
//   threshold == 0 : plain ReLU of nram_src into nram_dst.
//   threshold > 0  : bounded ReLU (__bang_relun on arch >= 300, otherwise an
//                    equivalent sequence of compare/multiply/add intrinsics).
// nram_tmp is only dereferenced on arch < 300 with a non-zero threshold; in
// that case it must hold deal_num + 2 * (NFU_ALIGN_SIZE / sizeof(T)) elements.
template <typename T>
static __mlu_func__ void computeReluN(T *nram_dst, T *nram_src, void *nram_tmp,
                                      const int deal_num,
                                      const T threshold = 0) {
  if (threshold < 0) {
    return;
  }

  if (threshold) {
#if __BANG_ARCH__ >= 300
    __bang_relun(nram_dst, nram_src, deal_num, threshold);
#else
    const int align_num = NFU_ALIGN_SIZE / sizeof(T);
    // Scratch layout: [mask: deal_num][threshold: align_num][zero: align_num]
    T *mask = (T *)nram_tmp;
    T *thresh_vec = mask + deal_num;
    T *zero_vec = thresh_vec + align_num;
    __bang_write_value(thresh_vec, align_num, threshold);
    __bang_write_zero(zero_vec, align_num);

    // Keep the elements that compare below the threshold ...
    __bang_cycle_lt(mask, nram_src, thresh_vec, deal_num, align_num);
    __bang_mul(nram_dst, nram_src, mask, deal_num);
    // ... and put the threshold itself where the comparison failed.
    __bang_cycle_eq(mask, mask, zero_vec, deal_num, align_num);
    __bang_cycle_mul(mask, mask, thresh_vec, deal_num, align_num);
    __bang_add(nram_dst, nram_dst, mask, deal_num);
    // Finally zero out everything that is not greater than zero.
    __bang_cycle_gt(mask, nram_dst, zero_vec, deal_num, align_num);
    __bang_mul(nram_dst, nram_dst, mask, deal_num);
#endif
    return;
  }

  // threshold == 0: unbounded ReLU.
#if __BANG_ARCH__ >= 300
  __bang_relu(nram_dst, nram_src, deal_num);
#else
  __bang_active_relu(nram_dst, nram_src, deal_num);
#endif
}
// Splits input_box_num boxes across core_limit cores and derives, for the
// calling core, its start offset plus the segment sizes used for the max-score
// search (max_seg_pad / repeat / remain / remain_pad) and for the IoU pass
// (the *_iou_compute outputs). All results are returned through the reference
// parameters.
__mlu_func__ void getComputeParamsBlockOrU1(
    const int input_dwidth, const int input_box_num, const int limit,
    const int core_limit, int &input_offset, int &max_seg_pad, int &repeat,
    int &remain, int &remain_pad, int &max_seg_iou_compute,
    int &repeat_iou_compute, int &remain_iou_compute,
    int &remain_pad_iou_compute) {
  // Even split; the first `leftover` cores take one extra box each.
  const int boxes_per_core = input_box_num / core_limit;
  const int leftover = input_box_num % core_limit;
  int core_len = boxes_per_core;
  if (coreId < leftover) {
    core_len += 1;
  }
  input_offset =
      boxes_per_core * coreId + (coreId <= leftover ? coreId : leftover);

  // Segmentation for the score pass.
  max_seg_pad = NMS_DOWN(limit, NMS_SIZE);
  repeat = core_len / max_seg_pad;
  remain = core_len % max_seg_pad;
  remain_pad = NMS_UP(remain, NMS_SIZE);

  // Segmentation for the IoU pass.
  // if datatype is fp16, we should cvt to fp32 when compute iou
  max_seg_iou_compute = NMS_DOWN(max_seg_pad / (4 / input_dwidth), NMS_SIZE);
  repeat_iou_compute = core_len / max_seg_iou_compute;
  remain_iou_compute = core_len % max_seg_iou_compute;
  remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
}
// Two-level split of input_num_boxes: first across clusterDim clusters, then
// across the coreDim cores of the calling cluster. Outputs the calling core's
// start offset, the segmentation of the whole cluster slice used for the
// max-score search (max_seg_pad / repeat / remain / remain_pad), and the
// segmentation of the core slice used for the IoU pass (*_iou_compute).
__mlu_func__ void getComputeParamsUx(
    const int input_dwidth, const int input_num_boxes, const int limit,
    int &input_offset, int &max_seg_pad, int &repeat, int &remain,
    int &remain_pad, int &max_seg_iou_compute, int &repeat_iou_compute,
    int &remain_iou_compute, int &remain_pad_iou_compute) {
  // Cluster-level split.
  const int boxes_per_cluster = input_num_boxes / clusterDim;
  const int cluster_leftover = input_num_boxes % clusterDim;
  const int cluster_len = boxes_per_cluster + (clusterId < cluster_leftover);
  const int cluster_start =
      boxes_per_cluster * clusterId +
      (clusterId <= cluster_leftover ? clusterId : cluster_leftover);

  // Core-level split inside this cluster's slice.
  const int boxes_per_core = cluster_len / coreDim;
  const int core_leftover = cluster_len % coreDim;
  const int core_len = boxes_per_core + (coreId < core_leftover);
  const int core_start =
      boxes_per_core * coreId + (coreId <= core_leftover ? coreId : core_leftover);

  input_offset = cluster_start + core_start;
  max_seg_pad = NMS_DOWN(limit, NMS_SIZE);

  // core 0 of each cluster calculate the max score index, so the score pass
  // is segmented over the whole cluster slice rather than the core slice.
  repeat = cluster_len / max_seg_pad;
  remain = cluster_len % max_seg_pad;
  remain_pad = NMS_UP(remain, NMS_SIZE);

  // if datatype is fp16, we should cvt to fp32 when compute iou
  max_seg_iou_compute =
      NMS_DOWN(max_seg_pad / (sizeof(float) / input_dwidth), NMS_SIZE);
  repeat_iou_compute = core_len / max_seg_iou_compute;
  remain_iou_compute = core_len % max_seg_iou_compute;
  remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
}
// Reduces the per-cluster max-box records (REDUCE_NUM elements each, stored at
// the start of every cluster's sram) to a single global record and copies that
// record into max_box on every calling core.
//   max_box  : destination of the winning record; also used as scratch by the
//              reducing core (the SRAM2NRAM / NRAM2SRAM copies below imply an
//              NRAM buffer).
//   sram     : per-cluster staging buffer; cluster 0's copy must have room for
//              clusterDim records of REDUCE_NUM elements.
//   inter_x1 : scratch buffer of at least NMS_SIZE elements.
// The function contains __sync_all() / __sync_cluster() barriers, so it has to
// be reached by all participating cores.
template <typename IN_DT>
__mlu_func__ void findGlobalMaxBox(IN_DT *max_box, IN_DT *sram,
IN_DT *inter_x1) {
// copy all partial max to the sram of cluster 0
if (clusterId != 0) {
__memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT),
SRAM2SRAM, 0);
}
__sync_all();
// reduce between clusters to get the global max box
if (clusterId == 0) {
if (coreId == 0) {
// Gather the first element (the score) of each cluster record into
// inter_x1; unused slots stay zero.
__bang_write_zero(inter_x1, NMS_SIZE);
__memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
__bang_max(max_box, inter_x1, NMS_SIZE);
// Element [1] of the __bang_max result is read back as an integer and used
// as the index of the winning cluster (presumably __bang_max writes the
// max value at [0] and its position at [1] -- confirm in the BANG docs).
int max_cluster = (sizeof(IN_DT) == sizeof(half))
? ((uint16_t *)max_box)[1]
: ((uint32_t *)max_box)[1];
// Fetch the winning record and publish it at the head of cluster 0's sram.
__memcpy(max_box, sram + max_cluster * REDUCE_NUM,
REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_cluster();
// 0x80 is the same value as MEMORY_CORE.
if (coreId == 0x80 && clusterDim > 1) {
// broadcast global max box to each cluster's sram
for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
__memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
cluster_idx);
}
}
__sync_cluster();
}
__sync_all();
// copy the global max box to max_box
__memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
}
// Scans this core's slice of the scores segment by segment and records the
// best box seen so far in max_box:
//   max_box[0]              score
//   max_box[1..4]           x1, y1, x2, y2 read at max_index
//   max_box + INFO_NUM      max_index stored as a uint32_t
// max_box[0] and max_index are only overwritten when a segment contains a
// score strictly greater than the incoming max_box[0], so both must be
// initialised by the caller. The core whose coreId equals MEMORY_CORE does
// nothing.
//   score    : scratch buffer for one segment of scores.
//   inter_x1 : scratch buffer receiving the __bang_max result.
template <typename IN_DT>
__mlu_func__ void findCoreMaxBox(
    IN_DT *input_score_ptr, IN_DT *score, IN_DT *inter_x1, IN_DT *max_box,
    const IN_DT *input_x1_ptr, const IN_DT *input_y1_ptr,
    const IN_DT *input_x2_ptr, const IN_DT *input_y2_ptr,
    const mluMemcpyDirection_t load_dir, const int input_offset,
    const int repeat, const int remain, const int remain_pad,
    const int max_seg_pad, int &max_index) {
  if (coreId == MEMORY_CORE) {
    return;
  }

  for (int seg = 0; seg <= repeat; seg++) {
    const bool is_tail = (seg == repeat);
    if (is_tail && remain == 0) {
      break;
    }
    // seg_len: the length every nms compute (padded for the tail segment)
    // cpy_len: the length every nms memcpy
    const int seg_len = is_tail ? remain_pad : max_seg_pad;
    const int cpy_len = is_tail ? remain : max_seg_pad;

    /******NMS LOAD START******/
    __bang_write_zero(score, seg_len);
    __memcpy(score, input_score_ptr + input_offset + seg * max_seg_pad,
             cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
             cpy_len * sizeof(IN_DT), 0);
    /******NMS LOAD END******/

    __bang_max(inter_x1, score, seg_len);
    if (!(inter_x1[0] > max_box[0])) {
      continue;
    }
    max_box[0] = inter_x1[0];
    // Element [1] of the __bang_max result is read as the in-segment index;
    // the added terms make it an offset from the head of input_data.
    if (sizeof(IN_DT) == sizeof(half)) {
      max_index = ((uint16_t *)inter_x1)[1] + input_offset + seg * max_seg_pad;
    } else if (sizeof(IN_DT) == sizeof(float)) {
      max_index = ((uint32_t *)inter_x1)[1] + input_offset + seg * max_seg_pad;
    }
  }  // for repeat

  // the max box's x1, y1, x2, y2 on every core
  max_box[1] = input_x1_ptr[max_index];
  max_box[2] = input_y1_ptr[max_index];
  max_box[3] = input_x2_ptr[max_index];
  max_box[4] = input_y2_ptr[max_index];
  ((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
}
// Reduces the per-core max-box records of one cluster to the cluster's best
// record. Each core publishes its REDUCE_NUM-element record at
// sram + REDUCE_NUM * coreId; after the cluster barrier every calling core
// gathers the scores, picks the largest and copies the winning record back
// into its own max_box.
//   inter_x1 : scratch buffer of at least NMS_SIZE elements.
// input_data_score and core_limit are not referenced by this function.
template <typename IN_DT>
__mlu_func__ void findClusterMaxBox(IN_DT *sram, IN_DT *max_box,
                                    IN_DT *inter_x1, IN_DT *input_data_score,
                                    const int core_limit) {
  // find the max with sram
  // copy every core's box info to sram, form: score---x1---y1---x2---y2---
  IN_DT *my_record = sram + REDUCE_NUM * coreId;
  __memcpy(my_record, max_box, REDUCE_NUM * sizeof(IN_DT),
           NRAM2SRAM);  // int32_t datatype
  __sync_cluster();

  // copy score from sram to nram and find the max
  __bang_write_zero(inter_x1, NMS_SIZE);
  __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
           REDUCE_NUM * sizeof(IN_DT), coreDim - 1);
  __bang_max(max_box, inter_x1, NMS_SIZE);

  // Element [1] of the __bang_max result is read back as the winning core.
  int max_core;
  if (sizeof(IN_DT) == sizeof(half)) {
    max_core = ((uint16_t *)max_box)[1];
  } else {
    max_core = ((uint32_t *)max_box)[1];
  }

  // copy the max box to max_box
  __memcpy(max_box, sram + max_core * REDUCE_NUM, REDUCE_NUM * sizeof(IN_DT),
           SRAM2NRAM);
}
/*****************************************************************************/
/*******************************CALCULATE MAX AREA****************************/
/*****************************************************************************/
// Computes the area of the box stored in max_box (x1 = [1], y1 = [2],
// x2 = [3], y2 = [4]) in float precision and writes it to max_area.
// `offset` is added to both side lengths unless algo == 0 or offset == 0.
template <typename IN_DT>
__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
                             float &max_area) {
  const bool skip_offset = (algo == 0 || offset == 0.0);
  float box_w = (float)max_box[3] - (float)max_box[1];
  float box_h = (float)max_box[4] - (float)max_box[2];
  if (!skip_offset) {
    box_w = box_w + offset;
    box_h = box_h + offset;
  }
  max_area = box_w * box_h;
}
// Same as the four-argument calMaxArea, but additionally returns the box
// corners as floats, ordered so that max_box_x1 <= max_box_x2 and
// max_box_y1 <= max_box_y2 whenever the stored pair compares as "greater".
// `offset` is added to both side lengths unless algo == 0 or offset == 0.
template <typename IN_DT>
__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
                             float &max_area, float &max_box_x1,
                             float &max_box_y1, float &max_box_x2,
                             float &max_box_y2) {
  // the case of random inf will break the requirement of x1<=x2, y1<=y2
  // so exchange it if it happens.
  if (max_box[1] > max_box[3]) {
    max_box_x1 = float(max_box[3]);
    max_box_x2 = float(max_box[1]);
  } else {
    max_box_x1 = float(max_box[1]);
    max_box_x2 = float(max_box[3]);
  }
  if (max_box[2] > max_box[4]) {
    max_box_y1 = float(max_box[4]);
    max_box_y2 = float(max_box[2]);
  } else {
    max_box_y1 = float(max_box[2]);
    max_box_y2 = float(max_box[4]);
  }

  const bool skip_offset = (algo == 0 || offset == 0.0);
  if (skip_offset) {
    max_area = (max_box_x2 - max_box_x1) * (max_box_y2 - max_box_y1);
    return;
  }
  max_area =
      (max_box_x2 - max_box_x1 + offset) * (max_box_y2 - max_box_y1 + offset);
}
/***********************************************************************/
/*******************************STORE RESULT****************************/
/***********************************************************************/
// Stages the current max box in nram_save and flushes the staged results to
// output_dram when needed.
//   * If the score max_box[0] exceeds thresh_score, nram_save_count and
//     output_box_num are incremented on every calling core; only core 0 of
//     cluster 0 actually writes the entry, in the layout chosen by
//     output_mode (0: index, 1: score,x1,y1,x2,y2 interleaved,
//     2: five planes of scores / coordinates).
//   * A flush happens when at least one box has been produced and either the
//     staging buffer is full, the score is at or below the threshold, or
//     keep is the last iteration (max_output_size - 1). Only core 0 of
//     cluster 0 copies to output_dram, advances output_dram and resets
//     nram_save_count.
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void storeResult(IN_DT *max_box, OUT_DT *nram_save,
                              OUT_DT *&output_dram, const int keep,
                              const int nram_save_limit_count,
                              const int max_output_size,
                              const float thresh_score, const int output_mode,
                              int &nram_save_count, uint32_t &output_box_num) {
  const bool is_writer_core = (clusterId == 0 && coreId == 0);

  /******NMS STORE START******/
  // store to nram
  if (float(max_box[0]) > thresh_score) {
    if (is_writer_core) {
      switch (output_mode) {
        case 0:  // index1, index2, ...
          nram_save[nram_save_count] = ((uint32_t *)(max_box + INFO_NUM))[0];
          break;
        case 1:  // score, x1, y1, x2, y2
          __memcpy(nram_save + nram_save_count * INFO_NUM, max_box,
                   INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
                   INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
          break;
        case 2:  // score---, x1---, y1---, x2---, y2---
          __memcpy(nram_save + nram_save_count, max_box, 1 * sizeof(IN_DT),
                   NRAM2NRAM, nram_save_limit_count * sizeof(IN_DT),
                   1 * sizeof(IN_DT), 4);
          break;
        default:
          break;
      }
    }
    nram_save_count++;
    output_box_num++;
  }

  // store to sram/gdram
  if (output_box_num == 0) {
    return;
  }
  const bool need_flush = (nram_save_count == nram_save_limit_count) ||
                          (float(max_box[0]) <= thresh_score) ||
                          keep == max_output_size - 1;
  if (!need_flush || nram_save_count == 0 || !is_writer_core) {
    return;
  }

  switch (output_mode) {
    case 0:  // index1, index2, ...
      pvLock();
      __memcpy(output_dram, nram_save, nram_save_count * sizeof(uint32_t),
               NRAM2GDRAM);
      pvUnlock();
      output_dram += nram_save_count;
      break;
    case 1:  // score, x1, y1, x2, y2
      pvLock();
      __memcpy(output_dram, nram_save,
               nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
      pvUnlock();
      output_dram += nram_save_count * INFO_NUM;
      break;
    case 2:  // score---, x1---, y1---, x2---, y2---
      pvLock();
      __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
               NRAM2GDRAM, max_output_size * sizeof(IN_DT),
               nram_save_limit_count * sizeof(IN_DT), 4);
      pvUnlock();
      output_dram += nram_save_count;
      break;
    default:
      break;
  }
  // The staging buffer is considered empty again, whatever the output mode.
  nram_save_count = 0;
}
// Suppression pass for one selected max box: walks this core's slice of the
// boxes in segments, computes each box's intersection and union area with the
// max box, builds a per-box mask from the threshold comparison, multiplies the
// scores by that mask and writes the scores back to input_score_ptr.
//
//   input_score_ptr / input_{x1,y1,x2,y2}_ptr : source arrays, read with
//       load_dir; scores are written back with store_dir.
//   x1, y1, x2, y2, score, inter_x1..inter_y2 : working buffers. All compute
//       is done through (float *) casts; half input is converted first.
//   max_box            : used by the arch < 300 path (elements [1]..[4]).
//   max_box_x1..y2     : used by the arch >= 300 path.
//   thresh_iou, div_thresh_iou : when thresh_iou > 0 the intersection area is
//       scaled by div_thresh_iou (presumably 1 / thresh_iou, per the comment
//       in the body -- confirm with the caller); otherwise the union area is
//       scaled by thresh_iou.
//   offset, algo       : offset is added to widths/heights only when
//       algo == 1 and offset != 0.
//   nram_save          : not referenced in this function.
//
// NOTE(review): the coordinate load is a single strided __memcpy with 3
// repeats starting at inter_x1 (arch >= 300) or x1 (arch < 300), with a
// destination stride of max_seg_pad elements. This appears to rely on the
// four coordinate buffers being laid out max_seg_pad elements apart -- confirm
// against the caller's buffer allocation.
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void scoreUpdate(
IN_DT *input_score_ptr, const mluMemcpyDirection_t load_dir,
const mluMemcpyDirection_t store_dir, const IN_DT *input_x1_ptr,
const IN_DT *input_y1_ptr, const IN_DT *input_x2_ptr,
const IN_DT *input_y2_ptr, IN_DT *x1, IN_DT *y1, IN_DT *x2, IN_DT *y2,
IN_DT *score, IN_DT *inter_x1, IN_DT *inter_y1, IN_DT *inter_x2,
IN_DT *inter_y2, IN_DT *max_box, const float max_box_x1,
const float max_box_y1, const float max_box_x2, const float max_box_y2,
OUT_DT *nram_save, int repeat_iou_compute, int remain_iou_compute,
int remain_pad_iou_compute, int max_seg_iou_compute, int max_seg_pad,
const float thresh_iou, const float div_thresh_iou, const int input_offset,
const float offset, const float max_area, const int input_num_boxes,
const int algo) {
// One iteration per full segment, plus one for the remainder (if any).
for (int i = 0; i <= repeat_iou_compute; i++) {
if (i == repeat_iou_compute && remain_iou_compute == 0) {
break;
}
// seg_len: padded element count used by the vector intrinsics.
// cpy_len: real element count moved by __memcpy.
int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute
: max_seg_iou_compute;
int cpy_len =
(i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute;
/******NMS LOAD START******/
// dt_offset: element offset at which raw half data is staged inside a
// buffer before being widened to float in place (0 for float input).
int dt_offset = 0;
if (sizeof(IN_DT) == sizeof(float)) {
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
dt_offset = 0;
} else if (sizeof(IN_DT) == sizeof(half)) {
// Half scores are staged in x1 and converted to float into score.
__memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__bang_half2float((float *)score, (half *)x1, seg_len);
dt_offset = max_seg_iou_compute;
}
#if __BANG_ARCH__ >= 300
// Load the four coordinate rows (source rows are input_num_boxes apart).
__memcpy(inter_x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
input_num_boxes * sizeof(IN_DT), 3);
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)inter_x1,
(half *)inter_x1 + max_seg_iou_compute, seg_len);
__bang_half2float((float *)inter_y1,
(half *)inter_y1 + max_seg_iou_compute, seg_len);
__bang_half2float((float *)inter_x2,
(half *)inter_x2 + max_seg_iou_compute, seg_len);
__bang_half2float((float *)inter_y2,
(half *)inter_y2 + max_seg_iou_compute, seg_len);
}
// box transfer
// Order each coordinate pair so that x1 <= x2 and y1 <= y2.
__bang_minequal((float *)x1, (float *)inter_x1, (float *)inter_x2, seg_len);
__bang_maxequal((float *)x2, (float *)inter_x1, (float *)inter_x2, seg_len);
__bang_minequal((float *)y1, (float *)inter_y1, (float *)inter_y2, seg_len);
__bang_maxequal((float *)y2, (float *)inter_y1, (float *)inter_y2, seg_len);
// 1. compute IOU
// get the area_I
__bang_maxeq_scalar((float *)inter_x1, (float *)x1, max_box_x1,
seg_len); // inter_x1
__bang_mineq_scalar((float *)inter_x2, (float *)x2, max_box_x2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
// ReLU (threshold defaults to 0), so nram_tmp may be NULL here.
computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
seg_len); // inter_w
__bang_maxeq_scalar((float *)inter_y1, (float *)y1, float(max_box_y1),
seg_len); // inter_y1
__bang_mineq_scalar((float *)inter_y2, (float *)y2, float(max_box_y2),
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
if (algo == 1 && offset != 0.0) {
__bang_fusion(FUSION_FSA, (float *)inter_y1, (float *)x2, (float *)x1,
offset, seg_len, seg_len);
__bang_fusion(FUSION_FSA, (float *)inter_y2, (float *)y2, (float *)y1,
offset, seg_len, seg_len);
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
} else {
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_fusion(FUSION_FSM, (float *)inter_x2, (float *)y2, (float *)y1,
(float *)inter_y1, seg_len, seg_len);
}
// get the area_U: area + max_area - area_I
__bang_fusion(FUSION_FAS, (float *)inter_x2, (float *)inter_x2, max_area,
(float *)inter_x1, seg_len, seg_len);
// 2. select the box
// if IOU greater than thresh, set the score to zero, abort it: area_U >
// area_I * (1 / thresh)?
if (thresh_iou > 0.0) {
__bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
// process for nan
// mask = !(area_U < scaled area_I); the mask then multiplies the scores.
__bang_lt((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len);
__bang_not((float *)inter_x1, (float *)inter_x1, seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******NMS COMPUTE END******/
#else
// Load the four coordinate rows (source rows are input_num_boxes apart).
__memcpy(x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
input_num_boxes * sizeof(IN_DT), 3);
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, seg_len);
__bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, seg_len);
__bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, seg_len);
__bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, seg_len);
}
// 1. compute IOU
// get the area_I
// This path broadcasts max_box[1..4] into vectors instead of using the
// scalar min/max intrinsics, and does not reorder the box corners.
__bang_write_value((float *)inter_y1, seg_len,
float(max_box[1])); // max_x1
__bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
seg_len); // inter_x1
__bang_write_value((float *)inter_y2, seg_len,
float(max_box[3])); // max_x2
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
// ReLU (threshold defaults to 0), so nram_tmp may be NULL here.
computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
seg_len); // inter_w
__bang_write_value((float *)inter_x2, seg_len,
float(max_box[2])); // max_y1
__bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
seg_len); // inter_y1
__bang_write_value((float *)inter_x2, seg_len,
float(max_box[4])); // max_y2
__bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
__bang_add_scalar((float *)inter_y2, (float *)inter_y2, offset, seg_len);
}
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
// get the area_U: area + max_area - area_I
__bang_add_scalar((float *)inter_x2, (float *)inter_x2, float(max_area),
seg_len);
__bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
seg_len); // area_U
// 2. select the box
// if IOU greater than thresh, set the score to zero, abort it: area_U >
// area_I * (1 / thresh)?
if (thresh_iou > 0.0) {
__bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
// mask = (area_U >= scaled area_I); the mask then multiplies the scores.
__bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******NMS COMPUTE END******/
#endif
// update the score
if (sizeof(IN_DT) == sizeof(half)) {
convertFloat2half((half *)score, (float *)score, seg_len);
}
// Write the masked scores back; pvLock/pvUnlock guard the store.
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
cpy_len * sizeof(IN_DT), store_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
pvUnlock();
}
}
#endif // NMS_UTILS_HPP_
......@@ -53,9 +53,8 @@ __mlu_func__ void loadDataFromDramToNram(T *dst, const T *src,
int w_seg = position.w_end - position.w_start;
int size = h_seg * w_seg * shape_full.c;
__memcpy(dst,
src + position.n_start * n_offset + position.h_start * h_offset +
position.w_start * w_offset,
__memcpy(dst, src + position.n_start * n_offset +
position.h_start * h_offset + position.w_start * w_offset,
size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T),
n_seg - 1);
}
......@@ -89,7 +88,7 @@ __mlu_func__ void psamaskCollectForward(
int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c,
NFU_ALIGN_SIZE / sizeof(T));
__nramset(y_nram, elem_count, (T)0);
__bang_write_value(y_nram, elem_count, (T)0);
int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c;
int y_h_offset = shape_seg.w * shape_seg.c;
......@@ -155,7 +154,7 @@ __mlu_func__ void psamaskDistributeForward(
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
int elem_count =
CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T));
__nramset(y_nram_temp, elem_count, (T)0);
__bang_write_value(y_nram_temp, elem_count, (T)0);
int y_n_offset = align_hw * align_c;
int y_h_offset = shape_seg.w * align_c;
......@@ -242,7 +241,7 @@ __mlu_func__ void psamaskCollectBackward(
int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c,
NFU_ALIGN_SIZE / sizeof(T));
__nramset(dx_nram, elem_count, (T)0);
__bang_write_value(dx_nram, elem_count, (T)0);
int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c;
int dy_h_offset = shape_seg.w * dy_full.c;
......@@ -331,7 +330,8 @@ __mlu_func__ void psamaskDistributeBackward(
// fill zeros to dx
T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c;
int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c;
__nramset(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), (T)0);
__bang_write_value(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)),
(T)0);
int dy_n_offset_seg = align_hw * align_c;
int dy_h_offset_seg = shape_seg.w * align_c;
......
......@@ -130,10 +130,10 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
__memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM);
// interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
__bang_mul_const(tmp_cyc1, tmp_cyc1, w1, align_channel);
__bang_mul_const(tmp_cyc2, tmp_cyc2, w2, align_channel);
__bang_mul_const(tmp_cyc3, tmp_cyc3, w3, align_channel);
__bang_mul_const(tmp_cyc4, tmp_cyc4, w4, align_channel);
__bang_mul_scalar(tmp_cyc1, tmp_cyc1, w1, align_channel);
__bang_mul_scalar(tmp_cyc2, tmp_cyc2, w2, align_channel);
__bang_mul_scalar(tmp_cyc3, tmp_cyc3, w3, align_channel);
__bang_mul_scalar(tmp_cyc4, tmp_cyc4, w4, align_channel);
__bang_add(nram_in, tmp_cyc1, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc2, nram_in, align_channel);
......@@ -146,7 +146,7 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
} // loop_roi_grid_w
} // loop_roi_grid_h
T count_value = (T)(1.0 / count);
__bang_mul_const(nram_out, nram_out, count_value, align_channel);
__bang_mul_scalar(nram_out, nram_out, count_value, align_channel);
__memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM);
} // loop_cyc_num
}
......@@ -242,8 +242,8 @@ __mlu_global__ void MLUUnion1KernelRoiAlignAvg(
case CNRT_FLOAT16: {
roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned,
channels, pooled_height, pooled_width, input_height,
input_width, sampling_ratio,
(half)spatial_scale, num_rois);
input_width, sampling_ratio, (half)spatial_scale,
num_rois);
}; break;
case CNRT_FLOAT32: {
roialignForwardAvg((float *)input, (float *)rois, (float *)output,
......@@ -346,31 +346,31 @@ __mlu_func__ void unionRoiAlignBp(
&x_high, &y_low, &y_high);
if (x_low >= 0 && y_low >= 0) {
__memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM);
__bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w1,
c_align);
__bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w1,
c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_low * c,
(T *)buffer + c_align, c);
__bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w2,
c_align);
__bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w2,
c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_high * c,
(T *)buffer + c_align, c);
__bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w3,
c_align);
__bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w3,
c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_low * c,
(T *)buffer + c_align, c);
__bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w4,
c_align);
__bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w4,
c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_high * c,
(T *)buffer + c_align, c);
......@@ -401,34 +401,34 @@ __mlu_func__ void unionRoiAlignBp(
}
__memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T),
GDRAM2NRAM);
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w1,
align_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w1,
align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_low * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w2,
align_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w2,
align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_low * wo * c + x_high * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w3,
align_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w3,
align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_high * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w4,
align_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w4,
align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_high * wo * c + x_high * c + i * deal_once,
......
......@@ -204,11 +204,11 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
}
if (is_empty) {
__nramset((T *)nram_out, c_slice_align, (T)0);
__bang_write_value((T *)nram_out, c_slice_align, (T)0);
__memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
c_slice * t_size, NRAM2GDRAM);
if (NULL != argmax) {
__nramset((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
__bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
__memcpy((int32_t *)argmax_base + dst_offset + c_offset,
(int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM);
}
......@@ -238,18 +238,18 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
for (int i = 0; i < c_slice; i++) {
nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim);
}
__bang_add_const((float *)nram_a, (float *)nram_out, (float)bin_y1,
c_slice_align);
__bang_mul_const((float *)nram_ping, (float *)nram_a, (float)width,
c_slice_align);
__bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1,
c_slice_align);
__bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width,
c_slice_align);
/*compute input_w*/
__bang_mul_const((float *)nram_a, (float *)nram_out, (float)bin_wdim,
c_slice_align);
__bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim,
c_slice_align);
__bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a,
c_slice_align);
__bang_add_const((float *)nram_a, (float *)nram_a, (float)bin_x1,
c_slice_align);
__bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1,
c_slice_align);
__bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a,
c_slice_align);
convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a,
......@@ -290,9 +290,7 @@ __mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type,
rois_num, (float)spatial_scale, (float *)output_data,
argmax);
}; break;
default: {
break;
}
default: { break; }
}
}
} // namespace forward
......@@ -328,30 +326,30 @@ __mlu_func__ void convertIndex(
align_c);
// Perform 'temp_result - hstart' operation
__bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
align_c);
__bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
align_c);
// Perform 'temp_result1 - temp_result2 * width' operation
__bang_mul_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
align_c);
__bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
align_c);
convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
(int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
__bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp,
(float *)nram_argmax_fp_w, align_c);
// Perform 'temp_result - wstart' operation
__bang_sub_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, wstart,
align_c);
__bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w,
wstart, align_c);
// Perform 'temp_result = h * w_compute + w' operation
__bang_mul_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
w_compute, align_c);
__bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
w_compute, align_c);
__bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
(float *)nram_argmax_fp_w, align_c);
if (loop_flag == 1) {
__bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
(loop_id * true_limit), align_c);
__bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
(loop_id * true_limit), align_c);
}
convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1,
(float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2,
......@@ -460,21 +458,22 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
*/
// Load the data from GDRAM to NRAM.
__memcpy((T *)nram_grads + align_c * high_precision,
(const T *)grads + (n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
channels * sizeof(T), GDRAM2NRAM);
__memcpy(
(T *)nram_grads + align_c * high_precision,
(const T *)grads +
(n * pooled_height * pooled_width + ph * pooled_width + pw) *
channels,
channels * sizeof(T), GDRAM2NRAM);
if (high_precision) {
__bang_half2float((float *)nram_grads,
(half *)nram_grads + align_c * high_precision,
align_c);
}
__memcpy((int32_t *)nram_argmax,
(const int32_t *)argmax + (n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
__memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
(n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
channels * sizeof(int32_t), GDRAM2NRAM);
// Perform pooling operation on NRAM.
......@@ -523,20 +522,21 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
*/
// Load the data from GDRAM to NRAM.
__memcpy((T *)nram_grads + align_c * high_precision,
(const T *)grads + (n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
channels * sizeof(T), GDRAM2NRAM);
__memcpy(
(T *)nram_grads + align_c * high_precision,
(const T *)grads +
(n * pooled_height * pooled_width + ph * pooled_width + pw) *
channels,
channels * sizeof(T), GDRAM2NRAM);
if (high_precision) {
__bang_half2float((float *)nram_grads,
(half *)nram_grads + align_c * high_precision,
align_c);
}
__memcpy((int32_t *)nram_argmax,
(const int32_t *)argmax + (n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
__memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
(n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
channels * sizeof(int32_t), GDRAM2NRAM);
int ping_pong = 0;
......@@ -713,9 +713,7 @@ __mlu_global__ void MLUKernelRoiPoolBackward(
height, width, pooled_height, pooled_width, rois_num,
(const float)spatial_scale, high_precision);
}; break;
default: {
break;
}
default: { break; }
}
}
} // namespace backward
......
......@@ -26,7 +26,7 @@ __mlu_func__ void mluMultiKernelTinShift(
int t_shift = shifts[n_index * group_size + group_id];
int index = cur_channel_index % channel_size * hw_size +
n_index * time_size * channel_size * hw_size;
__nramset(data_nram, MAX_NRAM_SIZE, (char)0);
__bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;");
if (abs(t_shift) >= time_size) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
......@@ -109,7 +109,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence(
int next_sequence_index =
index / hw_size / channel_size % time_size + segmentime_size;
int cur_sequence_index = index / hw_size / channel_size % time_size;
__nramset(data_nram, MAX_NRAM_SIZE, (char)0);
__bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;");
if (max_number_hw_per_core == 0) {
mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment