Commit e847cf8a authored by bdf's avatar bdf Committed by Zaida Zhou
Browse files

[Refactor] Adapt mlu code to cntoolkit3.0.1

parent 4c6e99c8
...@@ -88,14 +88,14 @@ __mlu_func__ void bboxOverlapsWorkflow( ...@@ -88,14 +88,14 @@ __mlu_func__ void bboxOverlapsWorkflow(
// right - left + offset ---> left // right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride); __bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_const(vec_left, vec_left, (T)offset, batches_stride); __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right // bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride); __bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_const(vec_right, vec_right, (T)offset, batches_stride); __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom // zero vector ---> bottom
__nramset(vec_bottom, batches_stride, 0.f); __bang_write_value(vec_bottom, batches_stride, 0.f);
// width --> vec_left // width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride); __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
...@@ -107,11 +107,11 @@ __mlu_func__ void bboxOverlapsWorkflow( ...@@ -107,11 +107,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b1_area // get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top // (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_const(vec_top, vec_top, (T)offset, batches_stride); __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom // (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride); __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset) // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top; // ---> vec_top;
...@@ -121,11 +121,11 @@ __mlu_func__ void bboxOverlapsWorkflow( ...@@ -121,11 +121,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b2_area // get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1 // (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1 // (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1; // ---> b2_x1;
...@@ -137,7 +137,7 @@ __mlu_func__ void bboxOverlapsWorkflow( ...@@ -137,7 +137,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
T *inter_s = height; T *inter_s = height;
// offset vector ---> vec_b2_y1 // offset vector ---> vec_b2_y1
__nramset(vec_b2_y1, batches_stride, T(offset)); __bang_write_value(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1; T *vec_offset = vec_b2_y1;
if (mode == 0) { if (mode == 0) {
...@@ -164,10 +164,10 @@ __mlu_func__ void bboxOverlapsWorkflow( ...@@ -164,10 +164,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
int32_t base1 = b1 * COORD_NUM; int32_t base1 = b1 * COORD_NUM;
// set bbox1 and bbox2 to nram // set bbox1 and bbox2 to nram
__nramset(vec_b1_x1, batches_stride, bbox1[base1]); __bang_write_value(vec_b1_x1, batches_stride, bbox1[base1]);
__nramset(vec_b1_y1, batches_stride, bbox1[base1 + 1]); __bang_write_value(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
__nramset(vec_b1_x2, batches_stride, bbox1[base1 + 2]); __bang_write_value(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
__nramset(vec_b1_y2, batches_stride, bbox1[base1 + 3]); __bang_write_value(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
for (int32_t j = 0; j < num_loop_cpy; j++) { for (int32_t j = 0; j < num_loop_cpy; j++) {
int32_t index2 = j * batches_stride; int32_t index2 = j * batches_stride;
...@@ -195,13 +195,13 @@ __mlu_func__ void bboxOverlapsWorkflow( ...@@ -195,13 +195,13 @@ __mlu_func__ void bboxOverlapsWorkflow(
// right - left + offset ---> left // right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride); __bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_const(vec_left, vec_left, (T)offset, batches_stride); __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right // bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride); __bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_const(vec_right, vec_right, (T)offset, batches_stride); __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom // zero vector ---> bottom
__nramset(vec_bottom, batches_stride, (T)0); __bang_write_value(vec_bottom, batches_stride, (T)0);
// width --> vec_left // width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride); __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
...@@ -213,10 +213,10 @@ __mlu_func__ void bboxOverlapsWorkflow( ...@@ -213,10 +213,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b1_area // get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top // (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_const(vec_top, vec_top, (T)offset, batches_stride); __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom // (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride); __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset) // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top; // ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride); __bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
...@@ -225,10 +225,10 @@ __mlu_func__ void bboxOverlapsWorkflow( ...@@ -225,10 +225,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b2_area // get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1 // (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1 // (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1; // ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride); __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
...@@ -239,7 +239,7 @@ __mlu_func__ void bboxOverlapsWorkflow( ...@@ -239,7 +239,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
T *inter_s = height; T *inter_s = height;
// offset vector ---> vec_b2_y1 // offset vector ---> vec_b2_y1
__nramset(vec_b2_y1, batches_stride, T(offset)); __bang_write_value(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1; T *vec_offset = vec_b2_y1;
if (mode == 0) { if (mode == 0) {
......
...@@ -139,7 +139,7 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask, ...@@ -139,7 +139,7 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1; blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1;
// set output_nram to zero // set output_nram to zero
__nramset(output_nram, param.output_nram_size, T(0)); __bang_write_value(output_nram, param.output_nram_size, T(0));
// loop blocks of kernel window: grid_dim.(Kh, Kw) // loop blocks of kernel window: grid_dim.(Kh, Kw)
for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) { for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) {
...@@ -313,8 +313,8 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask, ...@@ -313,8 +313,8 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
T *sum = sum_array; T *sum = sum_array;
for (int g = 0; g < blkSize.G; ++g) { for (int g = 0; g < blkSize.G; ++g) {
__bang_mul_const(sum, src, mask_array[mask_index], __bang_mul_scalar(sum, src, mask_array[mask_index],
param.block_Cg_NFU); param.block_Cg_NFU);
// //
// NOTE: Since block_Cg_NFU >= block_Cg_stride, // NOTE: Since block_Cg_NFU >= block_Cg_stride,
// overlapped writing may occur on sum_array. // overlapped writing may occur on sum_array.
...@@ -446,8 +446,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output, ...@@ -446,8 +446,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
T *base_grad_input = (T *)grad_input + input_index; T *base_grad_input = (T *)grad_input + input_index;
__memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T), __memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T),
GDRAM2NRAM); GDRAM2NRAM);
__bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff, __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], num_align); ((T *)mask_buff)[mask_index], num_align);
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input, __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
(T *)grad_input_buff, num_align); (T *)grad_input_buff, num_align);
__bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff, __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
...@@ -485,8 +485,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output, ...@@ -485,8 +485,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
T *base_grad_input = (T *)grad_input + input_index; T *base_grad_input = (T *)grad_input + input_index;
__memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T), __memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T),
GDRAM2NRAM); GDRAM2NRAM);
__bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff, __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], rem_for_loop_align); ((T *)mask_buff)[mask_index], rem_for_loop_align);
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input, __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
(T *)grad_input_buff, rem_for_loop); (T *)grad_input_buff, rem_for_loop);
__bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff, __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
...@@ -541,12 +541,12 @@ void KernelCarafeBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, ...@@ -541,12 +541,12 @@ void KernelCarafeBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
const int wi, const int c, const int k_up, const int wi, const int c, const int k_up,
const int group, const int scale) { const int group, const int scale) {
if (dtype == CNRT_FLOAT16) { if (dtype == CNRT_FLOAT16) {
backward::MLUUnion1KernelCarafeBackward<half> backward::MLUUnion1KernelCarafeBackward<half><<<k_dim, k_type, queue>>>(
<<<k_dim, k_type, queue>>>(input, mask, grad_output, grad_input, input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
grad_mask, n, hi, wi, c, k_up, group, scale); group, scale);
} else { } else {
backward::MLUUnion1KernelCarafeBackward<float> backward::MLUUnion1KernelCarafeBackward<float><<<k_dim, k_type, queue>>>(
<<<k_dim, k_type, queue>>>(input, mask, grad_output, grad_input, input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
grad_mask, n, hi, wi, c, k_up, group, scale); group, scale);
} }
} }
...@@ -211,51 +211,52 @@ __mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src, ...@@ -211,51 +211,52 @@ __mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src,
// get sign bit // get sign bit
const float move_23bit = 8388608.0; const float move_23bit = 8388608.0;
// 0x80000000 = 1,000000000,0000000000000000000000000000 // 0x80000000 = 1,000000000,0000000000000000000000000000
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x80000000); 0x80000000);
__bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition, __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition,
src_count * sizeof(float), NFU_ALIGN_SIZE); src_count * sizeof(float), NFU_ALIGN_SIZE);
// get 1 or 0 from sign bit // get 1 or 0 from sign bit
// judg is Odd // judg is Odd
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x00000001); 0x00000001);
__bang_cycle_bor((char *)dst_addition, (char *)dst_addition, __bang_cycle_bor((char *)dst_addition, (char *)dst_addition,
(char *)src_addition, src_count * sizeof(float), (char *)src_addition, src_count * sizeof(float),
NFU_ALIGN_SIZE); NFU_ALIGN_SIZE);
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x80000001); 0x80000001);
__bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count, __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float)); NFU_ALIGN_SIZE / sizeof(float));
// minus xor, positive num invariant // minus xor, positive num invariant
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0xffffffff); 0xffffffff);
__bang_cycle_mul(dst, dst_addition, src_addition, src_count, __bang_cycle_mul(dst, dst_addition, src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float)); NFU_ALIGN_SIZE / sizeof(float));
__bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float)); __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float));
// convert int32 to float32 // convert int32 to float32
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x7fffff); __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x7fffff);
__bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition, __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition,
src_count * sizeof(float), NFU_ALIGN_SIZE); src_count * sizeof(float), NFU_ALIGN_SIZE);
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x4b000000); 0x4b000000);
__bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition, __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition,
src_count * sizeof(float), NFU_ALIGN_SIZE); src_count * sizeof(float), NFU_ALIGN_SIZE);
__bang_sub_const(dst, dst, move_23bit, src_count); __bang_sub_scalar(dst, dst, move_23bit, src_count);
// add one // add one
__bang_add(dst, dst, dst_addition, src_count); __bang_add(dst, dst, dst_addition, src_count);
// set sign for float32 // set sign for float32
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0xffffffff); 0xffffffff);
__bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count, __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float)); NFU_ALIGN_SIZE / sizeof(float));
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x00000001); 0x00000001);
__bang_cycle_add(dst_addition, dst_addition, src_addition, src_count, __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float)); NFU_ALIGN_SIZE / sizeof(float));
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0x80000000); 0x80000000);
__bang_cycle_band((char *)dst_addition, (char *)dst_addition, __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
(char *)src_addition, src_count * 4, 128); (char *)src_addition, src_count * 4, 128);
__bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4); __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4);
...@@ -291,18 +292,20 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, ...@@ -291,18 +292,20 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
// dst_addition = abs(src) // dst_addition = abs(src)
__bang_mul(dst_addition, src, (float *)dst, src_count); __bang_mul(dst_addition, src, (float *)dst, src_count);
// if dst_addition < 1.0 , then src_addition + 1, to fix add error. // if dst_addition < 1.0 , then src_addition + 1, to fix add error.
__nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 1.0f); __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
1.0f);
__bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count, __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float)); NFU_ALIGN_SIZE / sizeof(float));
__bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count); __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count);
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
0xbf800000); 0xbf800000);
// set negative flag -1.0 = 0xbf80000 // set negative flag -1.0 = 0xbf80000
__bang_cycle_eq( __bang_cycle_eq(
(float *)dst, (float *)dst, (float *)src_addition, src_count, (float *)dst, (float *)dst, (float *)src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float)); // to mark all src in [x<-1.0] NFU_ALIGN_SIZE / sizeof(float)); // to mark all src in [x<-1.0]
__bang_active_abs(dst_addition, src, src_count); __bang_active_abs(dst_addition, src, src_count);
__nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 8388608.0f); __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
8388608.0f);
// mask shift move 23 // mask shift move 23
__bang_cycle_add_tz( __bang_cycle_add_tz(
dst_addition, dst_addition, src_addition, src_count, dst_addition, dst_addition, src_addition, src_count,
...@@ -314,12 +317,12 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, ...@@ -314,12 +317,12 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
// to fix max value // to fix max value
// 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0, // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0,
// means max value. // means max value.
__bang_mul_const((float *)dst, (float *)dst, 16777215.0, src_count); __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count);
__bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst, __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst,
src_count * floatDchar); src_count * floatDchar);
// get low 23bit // get low 23bit
__nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
(unsigned)0x007fffff); (unsigned)0x007fffff);
// mask low 23bit is 1 // mask low 23bit is 1
__bang_cycle_band((char *)dst_addition, (char *)dst_addition, __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
(char *)src_addition, src_count * floatDchar, (char *)src_addition, src_count * floatDchar,
...@@ -327,16 +330,36 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, ...@@ -327,16 +330,36 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
// set 9 high bit ===> dst // set 9 high bit ===> dst
// -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000 // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000
// 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000 // 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000
__nramset(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000); __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);
__bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count, __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count,
NFU_ALIGN_SIZE / sizeof(float)); NFU_ALIGN_SIZE / sizeof(float));
// src or dst_addition // src or dst_addition
__bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition, __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition,
src_count * floatDchar); src_count * floatDchar);
__bang_mul_const((float *)dst, (float *)dst, -2.0, src_count); __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count);
__bang_bor((char *)dst, (char *)dst, (char *)dst_addition, __bang_bor((char *)dst, (char *)dst, (char *)dst_addition,
src_count * floatDchar); src_count * floatDchar);
#endif // __BANG_ARCH__ >= 300 #endif // __BANG_ARCH__ >= 300
} }
/*!
* @brief Converts float32 to half data type,
* the rounding mode on MLU200 is rd, on MLU300 is rn.
*
* @param[out] dst
* Pointer to NRAM that stores half type data.
* @param[in] src
* Pointer to NRAM that stores float32 type data.
* @param[in] src_count
* The count of elements in src.
*/
__mlu_func__ inline void convertFloat2half(half *dst, float *src,
int src_count) {
#if __BANG_ARCH__ >= 300
__bang_float2half_rn(dst, src, src_count);
#else
__bang_float2half_rd(dst, src, src_count);
#endif
}
#endif // COMMON_MLU_HELPER_HPP_ #endif // COMMON_MLU_HELPER_HPP_
...@@ -9,14 +9,9 @@ ...@@ -9,14 +9,9 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/ *************************************************************************/
#include "common_mlu_helper.hpp" #include "nms_utils.hpp"
#define NMS_SIZE (64)
#define COORD_DIM (4) #define COORD_DIM (4)
#define MEMORY_CORE (0x80)
#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score
#define REDUCE_NUM \
(7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024) #define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE) #define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
...@@ -24,348 +19,129 @@ ...@@ -24,348 +19,129 @@
__nram__ int8_t nram_buffer[SIZE_NRAM_BUF]; __nram__ int8_t nram_buffer[SIZE_NRAM_BUF];
__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF]; __mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF];
__mlu_func__ void pvLock() {
#if __BANG_ARCH__ == 270
if (coreId != MEMORY_CORE) {
__bang_lock(0, 0);
}
#endif
}
__mlu_func__ void pvUnlock() {
#if __BANG_ARCH__ == 270
if (coreId != MEMORY_CORE) {
__bang_unlock(0, 0);
}
#endif
}
enum Addr { SRAM, GDRAM }; enum Addr { SRAM, GDRAM };
template <typename IN_DT, typename OUT_DT> template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection( __mlu_func__ void nms_detection(
uint32_t *output_box_num, const int output_mode, const int input_layout, uint32_t &output_box_num, const int output_mode, OUT_DT *output_dram,
OUT_DT *output_data, const Addr dst, IN_DT *input_data_score, IN_DT *input_data_score, const IN_DT *input_data_box, const Addr input_ram,
const IN_DT *input_data_box, const Addr src, IN_DT *buffer, IN_DT *sram, const int core_limit, const int input_num_boxes,
const int buffer_size, IN_DT *sram, const int core_limit, const int max_output_size, const float thresh_iou, const float thresh_score,
const int input_box_num, const int input_stride, const int output_stride,
const int keepNum, const float thresh_iou, const float thresh_score,
const float offset, const int algo) { const float offset, const int algo) {
// global value, it is stored in sram with a offset from the begin. // global value
const int flag_offset_size = 28; int32_t *exit_flag = (int32_t *)(sram + 28);
int32_t *loop_end_flag = (int32_t *)(sram + flag_offset_size); exit_flag[0] = 0;
loop_end_flag[0] = 0;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
const int nms_buffer_count1 = 9; int nms_buffer_count1 = 9;
// temp nram buffer to store selected target. // temp nram buffer to store selected target.
const int nram_save_limit_count = 256; int nram_save_limit_count = 256;
float div_thresh_iou = 1.0 / thresh_iou; float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr // input data ptr
IN_DT *input_score_ptr; const IN_DT *input_x1_ptr = input_data_box;
const IN_DT *input_x1_ptr; const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
const IN_DT *input_y1_ptr; const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
const IN_DT *input_x2_ptr; const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;
const IN_DT *input_y2_ptr;
input_score_ptr = input_data_score;
input_x1_ptr = input_data_box;
if (input_layout == 0) {
// [boxes_num, 4]
input_y1_ptr = input_x1_ptr + 1;
input_x2_ptr = input_x1_ptr + 2;
input_y2_ptr = input_x1_ptr + 3;
} else if (input_layout == 1) {
// [4, boxes_num]
input_y1_ptr = input_x1_ptr + input_stride;
input_x2_ptr = input_y1_ptr + input_stride;
input_y2_ptr = input_x2_ptr + input_stride;
}
// nram data ptr
IN_DT *x1;
IN_DT *y1;
IN_DT *x2;
IN_DT *y2;
IN_DT *score;
IN_DT *inter_x1;
IN_DT *inter_y1;
IN_DT *inter_x2;
IN_DT *inter_y2;
IN_DT *max_box; // the max score, x1, y1, x2, y2
IN_DT *x1_mask;
IN_DT *y1_mask;
IN_DT *x2_mask;
IN_DT *y2_mask;
OUT_DT *nram_save;
int limit = 0; // find limit when GDRAM or SRAM int limit = 0; // find limit when GDRAM or SRAM
int len_core = 0; // the length deal by every core
int max_seg_pad = 0; // the max length every repeat int max_seg_pad = 0; // the max length every repeat
int repeat = 0; int repeat = 0;
int remain = 0; int remain = 0;
int remain_pad = 0; int remain_pad = 0;
int input_offset = 0; // offset of input_data for current core int input_offset = 0; // offset of input_data for current core
int nram_save_count = 0; int nram_save_count = 0;
// mask for collect x1, y1, x2, y2. each mask has 128 elements
const int mask_size = 128;
const int total_mask_size = 512;
if (output_mode == 0) { if (output_mode == 0) {
limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) - limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT) - nram_save_limit_count * sizeof(OUT_DT)) /
total_mask_size * sizeof(IN_DT)) /
(nms_buffer_count1 * sizeof(IN_DT)); (nms_buffer_count1 * sizeof(IN_DT));
} else { } else {
limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) - // 5 maens: score, x1, y1, x2, y2
nram_save_limit_count * INFO_NUM * sizeof(OUT_DT) - limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
total_mask_size * sizeof(IN_DT)) / nram_save_limit_count * 5 * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT)); (nms_buffer_count1 * sizeof(IN_DT));
} }
if (core_limit == 1) { int max_seg_iou_compute = 0;
len_core = input_box_num; int repeat_iou_compute = 0;
input_offset = 0; int remain_iou_compute = 0;
} else { int remain_pad_iou_compute = 0;
int avg_core = input_box_num / core_limit;
int rem = input_box_num % core_limit;
len_core = avg_core + (taskId < rem ? 1 : 0);
input_offset = avg_core * taskId + (taskId <= rem ? taskId : rem);
}
max_seg_pad = PAD_DOWN(limit, NMS_SIZE);
repeat = len_core / max_seg_pad;
remain = len_core % max_seg_pad;
remain_pad = PAD_UP(remain, NMS_SIZE);
// if datatype is half, we should convert it to float when compute the IoU getComputeParamsBlockOrU1(sizeof(IN_DT), input_num_boxes, limit, core_limit,
int max_seg_iou_compute = input_offset, max_seg_pad, repeat, remain,
PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE); remain_pad, max_seg_iou_compute, repeat_iou_compute,
int repeat_iou_compute = len_core / max_seg_iou_compute; remain_iou_compute, remain_pad_iou_compute);
int remain_iou_compute = len_core % max_seg_iou_compute;
int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE);
// initial the address point
score = buffer;
x1 = score + max_seg_pad;
y1 = x1 + max_seg_pad;
x2 = y1 + max_seg_pad;
y2 = x2 + max_seg_pad;
inter_x1 = y2 + max_seg_pad;
inter_y1 = inter_x1 + max_seg_pad;
inter_x2 = inter_y1 + max_seg_pad;
inter_y2 = inter_x2 + max_seg_pad;
x1_mask = inter_y2 + max_seg_pad;
y1_mask = x1_mask + mask_size;
x2_mask = y1_mask + mask_size;
y2_mask = x2_mask + mask_size;
max_box = y2_mask + mask_size; // the max score, x1, y1, x2, y2
// offset two line from max_box
nram_save = (OUT_DT *)((char *)max_box + NFU_ALIGN_SIZE);
// set mask for __bang_collect instruction // init the data ptr
if (input_layout == 0) { IN_DT *score = (IN_DT *)nram_buffer;
__nramset((IN_DT *)x1_mask, total_mask_size, (IN_DT)0); IN_DT *x1 = score + max_seg_pad;
for (int idx = 0; idx < mask_size; idx++) { IN_DT *y1 = x1 + max_seg_pad;
int index = (idx % COORD_DIM) * mask_size + idx; IN_DT *x2 = y1 + max_seg_pad;
x1_mask[index] = (IN_DT)1.0; IN_DT *y2 = x2 + max_seg_pad;
} IN_DT *inter_x1 = y2 + max_seg_pad;
} IN_DT *inter_y1 = inter_x1 + max_seg_pad;
IN_DT *inter_x2 = inter_y1 + max_seg_pad;
IN_DT *inter_y2 = inter_x2 + max_seg_pad;
IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
NFU_ALIGN_SIZE); // offset two line from max_box
for (int keep = 0; keep < keepNum; keep++) { // loop until the max_score <= 0 #if __BANG_ARCH__ >= 300
float max_box_x1 = 0;
float max_box_y1 = 0;
float max_box_x2 = 0;
float max_box_y2 = 0;
#endif
mluMemcpyDirection_t load_dir = SRAM2NRAM;
mluMemcpyDirection_t store_dir = NRAM2SRAM;
load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= 0
if (core_limit != 1) { if (core_limit != 1) {
__sync_cluster(); // sync before current loop __sync_cluster(); // sync before current loop
} }
/******find max start******/ /******FIND MAX START******/
int max_index = 0; // the max score index int max_index = 0; // the max score index
int global_max_index = 0; // for U1 int global_max_index = 0; // for U1
float max_area = 0; // the max score area float max_area = 0; // the max socre area
max_box[0] = 0; // init 0 max_box[0] = 0; // init 0
findCoreMaxBox(input_data_score, score, inter_x1, max_box, input_x1_ptr,
for (int i = 0; i <= repeat; i++) { input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
if (i == repeat && remain == 0) { input_offset, repeat, remain, remain_pad, max_seg_pad,
break; max_index);
}
int seg_len = 0; // the length every nms compute
int cpy_len = 0; // the length every nms memcpy
i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad;
// check seg_len exceeds the limit of fp16 or not. 65536 is the largest
// num that half data type could express.
if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) {
// seg length exceeds the max num for fp16 datatype!
return;
}
i == repeat ? cpy_len = remain : cpy_len = max_seg_pad;
/******nms load start******/
mluMemcpyDirection_t load_dir = SRAM2NRAM;
if (src == SRAM) {
load_dir = SRAM2NRAM;
} else {
load_dir = GDRAM2NRAM;
}
__nramset(score, seg_len, (IN_DT)0);
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
/******nms load end******/
__bang_max(inter_x1, score, seg_len);
if (inter_x1[0] > max_box[0]) {
max_box[0] = inter_x1[0];
if (sizeof(IN_DT) == sizeof(half)) {
max_index = ((uint16_t *)inter_x1)[1] + input_offset +
i * max_seg_pad; // offset start from head of input_data
} else if (sizeof(IN_DT) == sizeof(float)) {
max_index = ((uint32_t *)inter_x1)[1] + input_offset +
i * max_seg_pad; // offset start from head of input_data
}
}
} // for repeat
int stride = 1;
if (input_layout == 0) {
stride = input_stride;
} else if (input_layout == 1) {
stride = 1;
}
if (core_limit == 1) { if (core_limit == 1) {
max_box[1] = input_x1_ptr[max_index * stride]; #if __BANG_ARCH__ >= 300
max_box[2] = input_y1_ptr[max_index * stride]; calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box[3] = input_x2_ptr[max_index * stride]; max_box_x2, max_box_y2);
max_box[4] = input_y2_ptr[max_index * stride]; #else
if (algo == 0 || offset == 0.0) { calMaxArea(max_box, algo, offset, max_area);
max_area = ((float)max_box[3] - (float)max_box[1]) * #endif
((float)max_box[4] - (float)max_box[2]); input_data_score[max_index] = 0;
} else {
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
input_score_ptr[max_index] = 0;
global_max_index = max_index; global_max_index = max_index;
((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
} else if (core_limit == 4) { } else if (core_limit == 4) {
// find the max with sram
// the max box's x1, y1, x2, y2 on every core
if (coreId != MEMORY_CORE) {
max_box[1] = input_x1_ptr[max_index * stride];
max_box[2] = input_y1_ptr[max_index * stride];
max_box[3] = input_x2_ptr[max_index * stride];
max_box[4] = input_y2_ptr[max_index * stride];
}
((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
// copy every core's box info to sram, form: score---x1---y1---x2---y2---
for (int i = 0; i < INFO_NUM; i++) {
__memcpy(sram + i * core_limit + taskId, max_box + i, 1 * sizeof(IN_DT),
NRAM2SRAM);
}
// copy every core's max_index to sram, use 2 half to store max_index
__memcpy(sram + INFO_NUM * core_limit + taskId * 2, max_box + INFO_NUM,
sizeof(uint32_t),
NRAM2SRAM); // int32_t datatype
__sync_cluster(); __sync_cluster();
findClusterMaxBox(sram, max_box, inter_x1, input_data_score, core_limit);
// copy score from sram to nram and find the max #if __BANG_ARCH__ >= 300
__nramset(inter_x1, NMS_SIZE, (IN_DT)0); calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
__memcpy(inter_x1, sram, core_limit * sizeof(IN_DT), SRAM2NRAM); max_box_x2, max_box_y2);
__bang_max(max_box, inter_x1, NMS_SIZE); #else
int max_core = 0; calMaxArea(max_box, algo, offset, max_area);
if (sizeof(IN_DT) == sizeof(half)) { #endif
max_core = ((uint16_t *)max_box)[1]; global_max_index = ((uint32_t *)(max_box + 5))[0];
} else if (sizeof(IN_DT) == sizeof(float)) { input_data_score[global_max_index] = 0;
max_core = ((uint32_t *)max_box)[1];
}
// copy the max box from SRAM to NRAM
__memcpy(max_box + 1, sram + 1 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // x1
__memcpy(max_box + 2, sram + 2 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // y1
__memcpy(max_box + 3, sram + 3 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // x2
__memcpy(max_box + 4, sram + 4 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // y2
__memcpy(max_box + 5, sram + 5 * core_limit + 2 * max_core,
sizeof(uint32_t), SRAM2NRAM);
if (algo == 0 || offset == 0.0) {
max_area = ((float)max_box[3] - (float)max_box[1]) *
((float)max_box[4] - (float)max_box[2]);
} else {
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
global_max_index = ((uint32_t *)(max_box + INFO_NUM))[0];
input_score_ptr[global_max_index] = 0;
} }
// by now, we get: max_score|max_index|max_box|max_area // by now, we get: max_score|max_index|max_box|max_area
/******find max end******/ /******FIND MAX END******/
/******nms store start******/
// store to nram
if (float(max_box[0]) > thresh_score) {
OUT_DT *save_ptr;
int save_offset = 0;
int save_str_num = 0;
save_ptr = nram_save;
save_offset = nram_save_count;
save_str_num = nram_save_limit_count;
if (coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
__memcpy(save_ptr + save_offset, (uint32_t *)(max_box + INFO_NUM),
1 * sizeof(uint32_t), NRAM2NRAM, 1 * sizeof(uint32_t),
1 * sizeof(uint32_t), 0);
} else if (output_mode == 1) { // score, x1, y1, x2, y2
__memcpy(save_ptr + save_offset * INFO_NUM, max_box,
INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
} else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2---
__memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT),
NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT),
4);
}
}
nram_save_count++;
(*output_box_num)++;
}
// store to sram/gdram storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
if (*output_box_num != 0) { max_output_size, thresh_score, output_mode, nram_save_count,
mluMemcpyDirection_t store_dir = NRAM2GDRAM; output_box_num);
if (dst == SRAM) {
store_dir = NRAM2SRAM;
} else { // dst == GDRAM
store_dir = NRAM2GDRAM;
}
if ((nram_save_count == nram_save_limit_count) ||
(float(max_box[0]) <= thresh_score) || keep == keepNum - 1) {
if (nram_save_count != 0) {
if (coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
pvLock();
__memcpy(output_data, nram_save,
nram_save_count * sizeof(uint32_t), store_dir);
pvUnlock();
output_data += nram_save_count;
} else if (output_mode == 1) { // score, x1, y1, x2, y2
pvLock();
__memcpy(output_data, nram_save,
nram_save_count * INFO_NUM * sizeof(IN_DT), store_dir);
pvUnlock();
output_data += nram_save_count * INFO_NUM;
} else if (output_mode ==
2) { // score---, x1---, y1---, x2---, y2---
pvLock();
__memcpy(output_data, nram_save, nram_save_count * sizeof(IN_DT),
store_dir, output_stride * sizeof(IN_DT),
nram_save_limit_count * sizeof(IN_DT), 4);
pvUnlock();
output_data += nram_save_count;
}
nram_save_count = 0;
}
}
} // if move data nram->sram/gdram
} // if dst
// if the max score <= 0, end // if the max score <= 0, end
if (core_limit == 1) { if (core_limit == 1) {
...@@ -375,190 +151,40 @@ __mlu_func__ void nms_detection( ...@@ -375,190 +151,40 @@ __mlu_func__ void nms_detection(
} else { } else {
if (float(max_box[0]) <= thresh_score) { if (float(max_box[0]) <= thresh_score) {
if (coreId == 0) { if (coreId == 0) {
loop_end_flag[0] = 1; exit_flag[0] = 1;
} }
} }
__sync_cluster(); __sync_cluster();
if (loop_end_flag[0] == 1) { if (exit_flag[0] == 1) {
break; break;
} }
} }
/******nms store end******/ /******NMS STORE END******/
#if __BANG_ARCH__ >= 300
// To solve half data accuracy, we convert half to float to calculate IoU. scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
for (int i = 0; i <= repeat_iou_compute; i++) { input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
if (i == repeat_iou_compute && remain_iou_compute == 0) { inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box_x1,
break; max_box_y1, max_box_x2, max_box_y2, nram_save,
} repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
int seg_len = 0; // the length every nms compute max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
int cpy_len = 0; // the length every nms memcpy input_offset, offset, max_area, input_num_boxes, algo);
i == repeat_iou_compute ? seg_len = remain_pad_iou_compute #else
: seg_len = max_seg_iou_compute; scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
i == repeat_iou_compute ? cpy_len = remain_iou_compute input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
: cpy_len = max_seg_iou_compute; inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box[1],
max_box[2], max_box[3], max_box[4], nram_save,
/******nms load start******/ repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
mluMemcpyDirection_t load_dir = SRAM2NRAM; max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
if (src == SRAM) { input_offset, offset, max_area, input_num_boxes, algo);
load_dir = SRAM2NRAM; #endif
} else { } // for max_output_size
load_dir = GDRAM2NRAM;
}
__nramset((float *)score, seg_len, 0.0f);
int dt_offset = 0;
if (sizeof(IN_DT) == sizeof(float)) {
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
dt_offset = 0;
} else if (sizeof(IN_DT) == sizeof(half)) {
__nramset(x1, seg_len, half(0));
__memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__bang_half2float((float *)score, (half *)x1, seg_len);
dt_offset = max_seg_iou_compute;
}
if (input_layout == 0) {
// the following number 4 means x1, y1, x2, y2
__memcpy(
inter_x1,
input_x1_ptr + (input_offset + i * max_seg_iou_compute) * COORD_DIM,
cpy_len * COORD_DIM * sizeof(IN_DT), load_dir,
cpy_len * COORD_DIM * sizeof(IN_DT),
cpy_len * COORD_DIM * sizeof(IN_DT), 0);
// here use collect instruction to transpose the [n, 4] shape into [4,
// n] shape to avoid
// discrete memory accessing.
for (int c_i = 0; c_i < COORD_DIM * seg_len / mask_size; c_i++) {
// the following number 32 means 32 elements will be selected out by
// once operation
__bang_collect(x1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
x1_mask, mask_size);
__bang_collect(y1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
y1_mask, mask_size);
__bang_collect(x2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
x2_mask, mask_size);
__bang_collect(y2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
y2_mask, mask_size);
}
} else if (input_layout == 1) {
__memcpy(x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__memcpy(y1 + dt_offset,
input_y1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__memcpy(x2 + dt_offset,
input_x2_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__memcpy(y2 + dt_offset,
input_y2_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
}
/******nms load end******/
/******nms compute start******/
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
seg_len);
}
// 1、 compute IOU
// get the area_I
__nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1
__bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
seg_len); // inter_x1
__nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
__bang_active_relu((float *)inter_x1, (float *)inter_x1,
seg_len); // inter_w
__nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1
__bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
seg_len); // inter_y1
__nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2
__bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
__bang_active_relu((float *)inter_y1, (float *)inter_y1,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
__bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len);
}
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
// get the area_U: area + max_area - area_I
__bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area),
seg_len);
__bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
seg_len); // area_U
// 2、 select the box
// if IOU greater than thres, set the score to zero, abort it: area_U >
// area_I * (1 / thresh)?
if (thresh_iou > 0.0) {
__bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
__bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******nms compute end******/
// update the score
mluMemcpyDirection_t update_dir = NRAM2SRAM;
if (dst == SRAM) {
update_dir = NRAM2SRAM;
} else {
update_dir = NRAM2GDRAM;
}
if (sizeof(IN_DT) == sizeof(half)) {
__bang_float2half_rd((half *)score, (float *)score, seg_len);
}
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
cpy_len * sizeof(IN_DT), update_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
pvUnlock();
} // for repeat
} // for keepNum
} }
__mlu_global__ void MLUUnion1KernelNMS( __mlu_global__ void MLUUnion1KernelNMS(
const void *input_boxes, const void *input_confidence, const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int input_stride, const int input_num_boxes, const int max_output_size,
const int max_output_size, const float iou_threshold, const float iou_threshold, const float confidence_threshold,
const float confidence_threshold, const int mode, const int input_layout, const int output_mode, void *workspace, void *result_num, void *output,
void *workspace, void *result_num, void *output,
const cnrtDataType_t data_type_input, const float offset, const int algo) { const cnrtDataType_t data_type_input, const float offset, const int algo) {
if (data_type_input == CNRT_FLOAT16) { if (data_type_input == CNRT_FLOAT16) {
__memcpy(workspace, input_confidence, input_num_boxes * sizeof(half), __memcpy(workspace, input_confidence, input_num_boxes * sizeof(half),
...@@ -569,82 +195,48 @@ __mlu_global__ void MLUUnion1KernelNMS( ...@@ -569,82 +195,48 @@ __mlu_global__ void MLUUnion1KernelNMS(
} else { } else {
} }
int output_stride = max_output_size; uint32_t output_box_num = 0;
uint32_t result_box_num = 0; float *score_data = (float *)workspace;
if (mode == 0) { float *boxes_data = (float *)input_boxes;
uint32_t *out_data = (uint32_t *)output; float *sram = (float *)sram_buffer;
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *boxes_data = (half *)input_boxes;
half *confi_data = (half *)workspace;
half *buffer = (half *)nram_buffer;
half *sram = (half *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
case CNRT_FLOAT32: {
float *boxes_data = (float *)input_boxes;
float *confi_data = (float *)workspace;
float *buffer = (float *)nram_buffer;
float *sram = (float *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, if (output_mode == 0) {
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, if (data_type_input == CNRT_FLOAT32) {
sram, taskDim, input_num_boxes, input_stride, nms_detection(output_box_num, output_mode, (uint32_t *)output, score_data,
output_stride, max_output_size, iou_threshold, boxes_data, GDRAM, sram, taskDim, input_num_boxes,
confidence_threshold, offset, algo); max_output_size, iou_threshold, confidence_threshold,
((uint32_t *)result_num)[0] = result_box_num; offset, algo);
}; break; } else {
nms_detection(output_box_num, output_mode, (uint32_t *)output,
(half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
taskDim, input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
} }
} else { } else {
switch (data_type_input) { if (data_type_input == CNRT_FLOAT32) {
default: { return; } nms_detection(output_box_num, output_mode, (float *)output, score_data,
case CNRT_FLOAT16: { boxes_data, GDRAM, sram, taskDim, input_num_boxes,
half *boxes_data = (half *)input_boxes; max_output_size, iou_threshold, confidence_threshold,
half *confi_data = (half *)workspace; offset, algo);
half *out_data = (half *)output; } else {
half *buffer = (half *)nram_buffer; nms_detection(output_box_num, output_mode, (half *)output,
half *sram = (half *)sram_buffer; (half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
taskDim, input_num_boxes, max_output_size, iou_threshold,
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, confidence_threshold, offset, algo);
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
case CNRT_FLOAT32: {
float *boxes_data = (float *)input_boxes;
float *confi_data = (float *)workspace;
float *out_data = (float *)output;
float *buffer = (float *)nram_buffer;
float *sram = (float *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
} }
} }
((uint32_t *)result_num)[0] = output_box_num;
} }
template <typename IN_DT, typename OUT_DT> template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection_ux( __mlu_func__ void nms_detection_ux(
int32_t *loop_end_flag, uint32_t &output_box_num, OUT_DT *output_dram, int32_t *exit_flag, uint32_t &output_box_num, OUT_DT *output_dram,
IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram, IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram,
const int input_layout, const int input_num_boxes, const int input_stride, const int input_num_boxes, const int max_output_size,
const int max_output_size, const float thresh_iou, const float thresh_score, const float thresh_iou, const float thresh_score, const float offset,
const float offset, const int output_mode, const int algo) { const int output_mode, const int algo) {
loop_end_flag[0] = 0; exit_flag[0] = 0;
IN_DT *sram = (IN_DT *)sram_buffer; IN_DT *sram = (IN_DT *)sram_buffer;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
...@@ -654,16 +246,10 @@ __mlu_func__ void nms_detection_ux( ...@@ -654,16 +246,10 @@ __mlu_func__ void nms_detection_ux(
float div_thresh_iou = 1.0 / thresh_iou; float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr // input data ptr
IN_DT *input_score_ptr; const IN_DT *input_x1_ptr = boxes_data;
const IN_DT *input_x1_ptr; const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
const IN_DT *input_y1_ptr; const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
const IN_DT *input_x2_ptr; const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;
const IN_DT *input_y2_ptr;
input_score_ptr = score_data;
input_x1_ptr = boxes_data;
input_y1_ptr = input_x1_ptr + input_stride;
input_x2_ptr = input_y1_ptr + input_stride;
input_y2_ptr = input_x2_ptr + input_stride;
int limit = 0; // find limit when GDRAM or SRAM int limit = 0; // find limit when GDRAM or SRAM
int max_seg_pad = 0; // the max length every repeat int max_seg_pad = 0; // the max length every repeat
...@@ -682,41 +268,16 @@ __mlu_func__ void nms_detection_ux( ...@@ -682,41 +268,16 @@ __mlu_func__ void nms_detection_ux(
(nms_buffer_count1 * sizeof(IN_DT)); (nms_buffer_count1 * sizeof(IN_DT));
} }
// data split int input_offset = 0;
int avg_cluster = input_num_boxes / clusterDim; int max_seg_iou_compute = 0;
int rem_cluster = input_num_boxes % clusterDim; int repeat_iou_compute = 0;
int len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0); int remain_iou_compute = 0;
int cluster_offset = avg_cluster * clusterId + int remain_pad_iou_compute = 0;
(clusterId <= rem_cluster ? clusterId : rem_cluster);
int avg_core = len_cluster / coreDim;
int rem_core = len_cluster % coreDim;
int len_core = avg_core + (coreId < rem_core ? 1 : 0);
int core_offset =
avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
int input_offset = cluster_offset + core_offset;
max_seg_pad = PAD_DOWN(limit, NMS_SIZE);
// core 0 of each cluster calculate the max score index
int max_index_avg_core = input_num_boxes / clusterDim;
int max_index_rem_core = input_num_boxes % clusterDim;
int max_index_len_core =
max_index_avg_core + (clusterId < max_index_rem_core ? 1 : 0);
int max_index_input_offset =
max_index_avg_core * clusterId +
(clusterId <= max_index_rem_core ? clusterId : max_index_rem_core);
repeat = max_index_len_core / max_seg_pad;
remain = max_index_len_core % max_seg_pad;
remain_pad = PAD_UP(remain, NMS_SIZE);
// if datatype is fp16, we should cvt to fp32 when compute iou
int max_seg_iou_compute =
PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE);
int repeat_iou_compute = len_core / max_seg_iou_compute;
int remain_iou_compute = len_core % max_seg_iou_compute;
int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE);
getComputeParamsUx(sizeof(IN_DT), input_num_boxes, limit, input_offset,
max_seg_pad, repeat, remain, remain_pad,
max_seg_iou_compute, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute);
// init the nram ptr // init the nram ptr
IN_DT *score = (IN_DT *)nram_buffer; IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *x1 = score + max_seg_pad; IN_DT *x1 = score + max_seg_pad;
...@@ -731,320 +292,94 @@ __mlu_func__ void nms_detection_ux( ...@@ -731,320 +292,94 @@ __mlu_func__ void nms_detection_ux(
OUT_DT *nram_save = OUT_DT *nram_save =
(OUT_DT *)((char *)max_box + (OUT_DT *)((char *)max_box +
NFU_ALIGN_SIZE); // offset two line from max_box NFU_ALIGN_SIZE); // offset two line from max_box
#if __BANG_ARCH__ >= 300
mluMemcpyDirection_t input_load_dir = SRAM2NRAM; float max_box_x1 = 0;
mluMemcpyDirection_t input_store_dir = NRAM2SRAM; float max_box_y1 = 0;
input_load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM; float max_box_x2 = 0;
input_store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM; float max_box_y2 = 0;
#endif
mluMemcpyDirection_t load_dir = SRAM2NRAM;
mluMemcpyDirection_t store_dir = NRAM2SRAM;
load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size; for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= 0 keep++) { // loop until the max_score <= 0
__sync_all(); __sync_all();
/******FIND MAX START******/
int max_index = 0; int max_index = 0;
int global_max_index = 0; // for Ux int global_max_index = 0; // for Ux
float max_area = 0; // the max socre area float max_area = 0; // the max socre area
max_box[0] = 0; // init 0 max_box[0] = 0; // init 0
if (coreId == 0) { if (coreId == 0) {
for (int i = 0; i <= repeat; i++) { findCoreMaxBox(score_data, score, inter_x1, max_box, input_x1_ptr,
if (i == repeat && remain == 0) { input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
break; input_offset, repeat, remain, remain_pad, max_seg_pad,
} max_index);
int seg_len = (i == repeat)
? remain_pad
: max_seg_pad; // the length every nms compute
// check seg_len exceeds the limit of fp16 or not. 65536 is the largest
// num
// that fp16 could express.
if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) {
return;
}
int cpy_len = (i == repeat)
? remain
: max_seg_pad; // the length every nms memcpy
/******NMS LOAD START******/
__bang_write_zero(score, seg_len);
__memcpy(score,
input_score_ptr + max_index_input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
/******NMS LOAD END******/
__bang_max(inter_x1, score, seg_len);
if (inter_x1[0] > max_box[0]) {
max_box[0] = inter_x1[0];
if (sizeof(IN_DT) == sizeof(half)) {
max_index =
((uint16_t *)inter_x1)[1] + max_index_input_offset +
i * max_seg_pad; // offset start from head of input_data
} else if (sizeof(IN_DT) == sizeof(float)) {
max_index =
((uint32_t *)inter_x1)[1] + max_index_input_offset +
i * max_seg_pad; // offset start from head of input_data
}
}
} // for repeat
// the max box's x1, y1, x2, y2 on every cluster
max_box[1] = input_x1_ptr[max_index];
max_box[2] = input_y1_ptr[max_index];
max_box[3] = input_x2_ptr[max_index];
max_box[4] = input_y2_ptr[max_index];
((uint32_t *)(max_box + 5))[0] = max_index;
// copy max box info to sram // copy max box info to sram
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
} }
__sync_all(); __sync_all();
// copy all partial max to the sram of cluster 0 #if __BANG_ARCH__ <= 372
if (clusterId != 0) { findGlobalMaxBox(max_box, sram, inter_x1);
__memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT), #endif
SRAM2SRAM, 0);
}
__sync_all();
// reduce between clusters to get the global max box
if (clusterId == 0) {
if (coreId == 0) {
__bang_write_zero(inter_x1, NMS_SIZE);
__memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
__bang_max(max_box, inter_x1, NMS_SIZE);
int max_cluster = (sizeof(IN_DT) == sizeof(half))
? ((uint16_t *)max_box)[1]
: ((uint32_t *)max_box)[1];
__memcpy(max_box, sram + max_cluster * REDUCE_NUM,
REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_cluster();
if (coreId == 0x80 && clusterDim > 1) {
// broadcast global max box to each cluster's sram
for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
__memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
cluster_idx);
}
}
__sync_cluster();
}
__sync_all();
// copy the global max box to max_box #if __BANG_ARCH__ >= 300
__memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
if (algo == 0 || offset == 0.0) { max_box_x2, max_box_y2);
max_area = ((float)max_box[3] - (float)max_box[1]) * #else
((float)max_box[4] - (float)max_box[2]); calMaxArea(max_box, algo, offset, max_area);
} else { #endif
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
global_max_index = ((uint32_t *)(max_box + 5))[0]; global_max_index = ((uint32_t *)(max_box + 5))[0];
if (coreId != 0x80) { if (coreId != MEMORY_CORE) {
input_score_ptr[global_max_index] = 0; score_data[global_max_index] = 0;
} }
// by now, we get: max_score|max_index|max_box|max_area
/******FIND MAX END******/
/******NMS STORE START******/ storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
// store to nram max_output_size, thresh_score, output_mode, nram_save_count,
if (float(max_box[0]) > thresh_score) { output_box_num);
OUT_DT *save_ptr;
int save_offset = 0;
int save_str_num = 0;
save_ptr = nram_save;
save_offset = nram_save_count;
save_str_num = nram_save_limit_count;
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0];
} else if (output_mode == 1) { // score, x1, y1, x2, y2
__memcpy(save_ptr + save_offset * INFO_NUM, max_box,
INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
} else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2---
__memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT),
NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT),
4);
}
}
nram_save_count++;
output_box_num++;
}
// store to sram/gdram
if (output_box_num != 0) {
if ((nram_save_count == nram_save_limit_count) ||
(float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) {
if (nram_save_count != 0) {
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
pvLock();
__memcpy(output_dram, nram_save,
nram_save_count * sizeof(uint32_t), NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count;
} else if (output_mode == 1) { // score, x1, y1, x2, y2
pvLock();
__memcpy(output_dram, nram_save,
nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count * INFO_NUM;
} else if (output_mode ==
2) { // score---, x1---, y1---, x2---, y2---
pvLock();
__memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
NRAM2GDRAM, max_output_size * sizeof(IN_DT),
nram_save_limit_count * sizeof(IN_DT), 4);
pvUnlock();
output_dram += nram_save_count;
}
nram_save_count = 0;
}
}
} // if move data nram->sram/gdram
} // if dst
if (float(max_box[0]) <= thresh_score) { if (float(max_box[0]) <= thresh_score) {
if (clusterId == 0 && coreId == 0) { if (clusterId == 0 && coreId == 0) {
loop_end_flag[0] = 1; // dram exit_flag[0] = 1; // dram
} }
} }
__sync_all(); __sync_all();
if (loop_end_flag[0] == 1) { if (exit_flag[0] == 1) {
break; break;
} }
/******NMS STORE END******/ /******NMS STORE END******/
#if __BANG_ARCH__ >= 300
// To solve fp16 accuracy, we convert fp16 to fp32 to calculate IoU. scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
for (int i = 0; i <= repeat_iou_compute; i++) { input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
if (i == repeat_iou_compute && remain_iou_compute == 0) { inter_y1, inter_x2, inter_y2, max_box, max_box_x1, max_box_y1,
break; max_box_x2, max_box_y2, nram_save, repeat_iou_compute,
} remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
: max_seg_iou_compute; max_area, input_num_boxes, algo);
int cpy_len = #else
(i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute; scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
/******NMS LOAD START******/ inter_y1, inter_x2, inter_y2, max_box, max_box[1], max_box[2],
__nramset((float *)score, seg_len, 0.0f); max_box[3], max_box[4], nram_save, repeat_iou_compute,
int dt_offset = 0; remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
if (sizeof(IN_DT) == sizeof(float)) { max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, max_area, input_num_boxes, algo);
cpy_len * sizeof(IN_DT), input_load_dir, #endif
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); } // for max_output_size
dt_offset = 0;
} else if (sizeof(IN_DT) == sizeof(half)) {
__nramset(x1, seg_len, half(0));
__memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
__bang_half2float((float *)score, (half *)x1, seg_len);
dt_offset = max_seg_iou_compute;
}
__memcpy(x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), input_load_dir,
max_seg_pad * sizeof(IN_DT), input_num_boxes * sizeof(IN_DT), 3);
/******NMS LOAD END******/
/******NMS COMPUTE START******/
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
seg_len);
}
// 1、 compute IOU
// get the area_I
__nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1
__bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
seg_len); // inter_x1
__nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
__bang_active_relu((float *)inter_x1, (float *)inter_x1,
seg_len); // inter_w
__nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1
__bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
seg_len); // inter_y1
__nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2
__bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
__bang_active_relu((float *)inter_y1, (float *)inter_y1,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
__bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len);
}
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
// get the area_U: area + max_area - area_I
__bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area),
seg_len);
__bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
seg_len); // area_U
// 2、 select the box
// if IOU greater than thres, set the score to zero, abort it: area_U >
// area_I * (1 / thresh)?
if (thresh_iou > 0.0) {
__bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
__bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******NMS COMPUTE END******/
if (sizeof(IN_DT) == 2) {
__bang_float2half_rd((half *)score, (float *)score, seg_len);
}
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
cpy_len * sizeof(IN_DT), input_store_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
pvUnlock();
} // for repeat
} // for max_output_size
} }
__mlu_global__ void MLUUionXKernelNMS( __mlu_global__ void MLUUionXKernelNMS(
const void *input_boxes, const void *input_confidence, const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int input_layout, const int input_stride, const int input_num_boxes, const int max_output_size,
const int max_output_size, const float iou_threshold, const float iou_threshold, const float confidence_threshold,
const float confidence_threshold, const float offset, const float offset, const cnrtDataType_t data_type_input,
const cnrtDataType_t data_type_input, const int output_mode, const int algo, const int output_mode, const int algo, void *workspace, void *result_num,
void *workspace, void *result_num, void *output) { void *output) {
int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2; int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
int32_t *loop_end_flag = int32_t *exit_flag = (int32_t *)((char *)workspace +
(int32_t *)((char *)workspace + INFO_NUM * input_num_boxes * input_dwidth);
INFO_NUM * input_num_boxes * input_dwidth);
int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth; int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth;
int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size; int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size;
...@@ -1062,88 +397,55 @@ __mlu_global__ void MLUUionXKernelNMS( ...@@ -1062,88 +397,55 @@ __mlu_global__ void MLUUionXKernelNMS(
__memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM); __memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM);
} }
__sync_cluster(); __sync_cluster();
uint32_t output_box_num = 0; uint32_t output_box_num = 0;
float *score_data;
float *boxes_data;
score_data = (input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data = (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
if (output_mode == 0) { if (output_mode == 0) {
uint32_t *output_dram = (uint32_t *)output; if (data_type_input == CNRT_FLOAT32) {
switch (data_type_input) { nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
default: { return; } score_data, boxes_data, input_ram, input_num_boxes,
case CNRT_FLOAT16: { max_output_size, iou_threshold, confidence_threshold,
half *score_data; offset, output_mode, algo);
half *boxes_data; } else {
score_data = nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
(input_ram == SRAM) ? (half *)sram_score : (half *)workspace; (half *)score_data, (half *)boxes_data, input_ram,
boxes_data = input_num_boxes, max_output_size, iou_threshold,
(input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes; confidence_threshold, offset, output_mode, algo);
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
case CNRT_FLOAT32: {
float *score_data;
float *boxes_data;
score_data =
(input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data =
(input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
} }
} else { } else {
switch (data_type_input) { if (data_type_input == CNRT_FLOAT32) {
default: { return; } nms_detection_ux(exit_flag, output_box_num, (float *)output, score_data,
case CNRT_FLOAT16: { boxes_data, input_ram, input_num_boxes, max_output_size,
half *output_dram = (half *)output; iou_threshold, confidence_threshold, offset, output_mode,
half *score_data; algo);
half *boxes_data; } else {
score_data = nms_detection_ux(exit_flag, output_box_num, (half *)output,
(input_ram == SRAM) ? (half *)sram_score : (half *)workspace; (half *)score_data, (half *)boxes_data, input_ram,
boxes_data = input_num_boxes, max_output_size, iou_threshold,
(input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes; confidence_threshold, offset, output_mode, algo);
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
case CNRT_FLOAT32: {
float *output_dram = (float *)output;
float *score_data;
float *boxes_data;
score_data =
(input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data =
(input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
} }
} }
((uint32_t *)result_num)[0] = output_box_num;
} }
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_ptr, const cnrtDataType_t data_type_input, const void *boxes_ptr,
const void *scores_ptr, const int input_num_boxes, const void *scores_ptr, const int input_num_boxes,
const int input_stride, const int max_output_boxes, const int max_output_boxes, const float iou_threshold,
const float iou_threshold, const float offset, const float offset, void *workspace_ptr, void *output_size_ptr,
void *workspace_ptr, void *output_size_ptr, void *output_ptr) { void *output_ptr) {
switch (k_type) { switch (k_type) {
default: { return; } default: { return; }
case CNRT_FUNC_TYPE_BLOCK: case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_UNION1: { case CNRT_FUNC_TYPE_UNION1: {
MLUUnion1KernelNMS<<<k_dim, k_type, queue>>>( MLUUnion1KernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes, input_stride, (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0,
/*output_mode=*/0, /*output_mode=*/0, workspace_ptr, output_size_ptr, output_ptr,
/*input_layout=*/1, workspace_ptr, output_size_ptr, output_ptr,
data_type_input, offset, /*algo=*/1); data_type_input, offset, /*algo=*/1);
}; break; }; break;
case CNRT_FUNC_TYPE_UNION2: case CNRT_FUNC_TYPE_UNION2:
...@@ -1151,11 +453,10 @@ void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, ...@@ -1151,11 +453,10 @@ void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
case CNRT_FUNC_TYPE_UNION8: case CNRT_FUNC_TYPE_UNION8:
case CNRT_FUNC_TYPE_UNION16: { case CNRT_FUNC_TYPE_UNION16: {
MLUUionXKernelNMS<<<k_dim, k_type, queue>>>( MLUUionXKernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes, /*input_layout=*/1, (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
input_stride, max_output_boxes, iou_threshold, max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, offset,
/*confidence_threshold=*/0.0, offset, data_type_input, data_type_input, /*output_mode=*/0, /*algo=*/1, workspace_ptr,
/*output_mode=*/0, /*algo=*/1, workspace_ptr, output_size_ptr, output_size_ptr, output_ptr);
output_ptr);
}; break; }; break;
} }
} }
/*************************************************************************
* Copyright (C) [2019-2022] by Cambricon, Inc.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef NMS_UTILS_HPP_
#define NMS_UTILS_HPP_
#include "common_mlu_helper.hpp"
#define NMS_SIZE (64)
// Round x up / down to the nearest multiple of y (integer arithmetic).
// Every argument use and the whole expansion are parenthesized so that
// expression arguments (e.g. NMS_DOWN(a + b, NMS_SIZE)) and surrounding
// operators cannot change the intended grouping.
#define NMS_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
#define NMS_DOWN(x, y) (((x) / (y)) * (y))
#define INFO_NUM (5)  // 5 means x1, x2, y1, y2 and score
#define MEMORY_CORE (0x80)
#define REDUCE_NUM \
  (7)  // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
// Acquire hardware lock (0, 0) before GDRAM traffic on MLU270; on other
// architectures this compiles to a no-op. The memory core (MEMORY_CORE)
// never takes the lock.
__mlu_func__ void pvLock() {
#if __BANG_ARCH__ == 270
  if (coreId == MEMORY_CORE) {
    return;
  }
  __bang_lock(0, 0);
#endif
}
// Release the hardware lock taken by pvLock(); no-op on non-270
// architectures and on the memory core.
__mlu_func__ void pvUnlock() {
#if __BANG_ARCH__ == 270
  if (coreId == MEMORY_CORE) {
    return;
  }
  __bang_unlock(0, 0);
#endif
}
/* ReluN activation: nram_dst[i] = min(max(nram_src[i], 0), threshold).
 *
 * threshold == 0 degenerates to a plain Relu; threshold < 0 makes the call
 * a no-op (nram_dst is left untouched).
 *
 * On __BANG_ARCH__ >= 300 the fused __bang_relun / __bang_relu intrinsics
 * are used and nram_tmp may be NULL. Older architectures emulate ReluN with
 * mask/multiply ops and need nram_tmp as NRAM scratch holding at least
 * deal_num + 2 * (NFU_ALIGN_SIZE / sizeof(T)) elements.
 */
template <typename T>
static __mlu_func__ void computeReluN(T *nram_dst, T *nram_src, void *nram_tmp,
                                      const int deal_num,
                                      const T threshold = 0) {
  if (threshold < 0) {
    return;
  }
  if (threshold) {
#if __BANG_ARCH__ >= 300
    __bang_relun(nram_dst, nram_src, deal_num, threshold);
#else
    int align_num = NFU_ALIGN_SIZE / sizeof(T);
    T *nram_aux_a = (T *)nram_tmp;
    T *nram_aux_b = nram_aux_a + deal_num;
    T *nram_zero = nram_aux_b + align_num;
    __bang_write_value(nram_aux_b, align_num, threshold);
    __bang_write_zero(nram_zero, align_num);
    // mask of elements below threshold; keep those values as-is
    __bang_cycle_lt((T *)nram_aux_a, nram_src, (T *)nram_aux_b, deal_num,
                    align_num);
    __bang_mul(nram_dst, nram_src, (T *)nram_aux_a, deal_num);
    // invert the mask and write `threshold` where src >= threshold
    __bang_cycle_eq((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_zero, deal_num,
                    align_num);
    __bang_cycle_mul((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_aux_b,
                     deal_num, align_num);
    __bang_add(nram_dst, nram_dst, (T *)nram_aux_a, deal_num);
    // finally zero out the negative inputs (the relu part)
    __bang_cycle_gt((T *)nram_aux_a, nram_dst, (T *)nram_zero, deal_num,
                    align_num);
    __bang_mul(nram_dst, nram_dst, (T *)nram_aux_a, deal_num);
#endif
  } else {
#if __BANG_ARCH__ >= 300
    __bang_relu(nram_dst, nram_src, deal_num);
#else
    __bang_active_relu(nram_dst, nram_src, deal_num);
#endif
  }
}
/* Compute the per-core tiling parameters for Block / Union1 NMS.
 *
 * The input_box_num boxes are split evenly over core_limit cores; the first
 * (input_box_num % core_limit) cores take one extra box. Outputs:
 *   input_offset            - this core's starting box index
 *   max_seg_pad             - segment size, `limit` rounded down to NMS_SIZE
 *   repeat/remain/remain_pad- full segments, tail length, tail padded up
 *   *_iou_compute           - same tiling for the IoU stage; the segment is
 *                             halved for fp16 input (4 / input_dwidth == 2)
 *                             because scores are widened to fp32 there.
 * NOTE(review): `limit` looks like the per-segment NRAM element budget —
 * confirm against the caller.
 */
__mlu_func__ void getComputeParamsBlockOrU1(
    const int input_dwidth, const int input_box_num, const int limit,
    const int core_limit, int &input_offset, int &max_seg_pad, int &repeat,
    int &remain, int &remain_pad, int &max_seg_iou_compute,
    int &repeat_iou_compute, int &remain_iou_compute,
    int &remain_pad_iou_compute) {
  int avg_core = input_box_num / core_limit;
  int rem = input_box_num % core_limit;
  int len_core = avg_core + (coreId < rem ? 1 : 0);
  // offset = avg_core * coreId + min(coreId, rem)
  input_offset = avg_core * coreId + (coreId <= rem ? coreId : rem);
  max_seg_pad = NMS_DOWN(limit, NMS_SIZE);
  repeat = len_core / max_seg_pad;
  remain = len_core % max_seg_pad;
  remain_pad = NMS_UP(remain, NMS_SIZE);
  // if datatype is fp16, we should cvt to fp32 when compute iou
  max_seg_iou_compute = NMS_DOWN(max_seg_pad / (4 / input_dwidth), NMS_SIZE);
  repeat_iou_compute = len_core / max_seg_iou_compute;
  remain_iou_compute = len_core % max_seg_iou_compute;
  remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
}
/* Compute the tiling parameters for multi-cluster (UnionX) NMS.
 *
 * Boxes are first split across clusters, then each cluster's share across
 * its cores; the first `rem` clusters/cores take one extra box.
 * input_offset is this core's global starting box index.
 *
 * repeat/remain/remain_pad tile the *cluster-level* length (used by core 0
 * of each cluster for the max-score scan), while the *_iou_compute outputs
 * tile the per-core length for the IoU stage, with the segment halved for
 * fp16 input since scores are widened to fp32 there.
 */
__mlu_func__ void getComputeParamsUx(
    const int input_dwidth, const int input_num_boxes, const int limit,
    int &input_offset, int &max_seg_pad, int &repeat, int &remain,
    int &remain_pad, int &max_seg_iou_compute, int &repeat_iou_compute,
    int &remain_iou_compute, int &remain_pad_iou_compute) {
  // data split
  int avg_cluster = input_num_boxes / clusterDim;
  int rem_cluster = input_num_boxes % clusterDim;
  int len_cluster = avg_cluster + (clusterId < rem_cluster);
  int cluster_offset = avg_cluster * clusterId +
                       (clusterId <= rem_cluster ? clusterId : rem_cluster);
  int avg_core = len_cluster / coreDim;
  int rem_core = len_cluster % coreDim;
  int len_core = avg_core + (coreId < rem_core);
  int core_offset =
      avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
  input_offset = cluster_offset + core_offset;
  max_seg_pad = NMS_DOWN(limit, NMS_SIZE);
  // core 0 of each cluster calculate the max score index
  int max_index_len_core = avg_cluster + (clusterId < rem_cluster);
  repeat = max_index_len_core / max_seg_pad;
  remain = max_index_len_core % max_seg_pad;
  remain_pad = NMS_UP(remain, NMS_SIZE);
  // if datatype is fp16, we should cvt to fp32 when compute iou
  max_seg_iou_compute =
      NMS_DOWN(max_seg_pad / (sizeof(float) / input_dwidth), NMS_SIZE);
  repeat_iou_compute = len_core / max_seg_iou_compute;
  remain_iou_compute = len_core % max_seg_iou_compute;
  remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
}
/* Reduce the per-cluster max boxes (REDUCE_NUM entries each, laid out
 * consecutively in cluster 0's SRAM) to the single global max box, then
 * broadcast the winner so every core ends with it in max_box.
 *
 * inter_x1 is NRAM scratch of at least NMS_SIZE elements.
 * Requires all clusters to participate (contains __sync_all barriers).
 */
template <typename IN_DT>
__mlu_func__ void findGlobalMaxBox(IN_DT *max_box, IN_DT *sram,
                                   IN_DT *inter_x1) {
  // copy all partial max to the sram of cluster 0
  if (clusterId != 0) {
    __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT),
             SRAM2SRAM, 0);
  }
  __sync_all();
  // reduce between clusters to get the global max box
  if (clusterId == 0) {
    if (coreId == 0) {
      __bang_write_zero(inter_x1, NMS_SIZE);
      // gather each cluster's max score (stride REDUCE_NUM) into inter_x1
      __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
               REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
      __bang_max(max_box, inter_x1, NMS_SIZE);
      // __bang_max leaves the argmax in the second lane; lane width
      // depends on the element type
      int max_cluster = (sizeof(IN_DT) == sizeof(half))
                            ? ((uint16_t *)max_box)[1]
                            : ((uint32_t *)max_box)[1];
      __memcpy(max_box, sram + max_cluster * REDUCE_NUM,
               REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
      __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
    }
    __sync_cluster();
    // 0x80 is the memory core (MEMORY_CORE); it performs the SRAM2SRAM copy
    if (coreId == 0x80 && clusterDim > 1) {
      // broadcast global max box to each cluster's sram
      for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
        __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
                 cluster_idx);
      }
    }
    __sync_cluster();
  }
  __sync_all();
  // copy the global max box to max_box
  __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
}
/* Scan this core's segment of scores and record the highest-scoring box.
 *
 * On exit, max_box holds {score, x1, y1, x2, y2} of the local winner and
 * the winning global index is stored (as uint32) at max_box + 5; max_index
 * is updated to that index. max_box[0] must hold the running best score on
 * entry (the comparison is cumulative). The memory core (coreId 0x80) does
 * nothing. score and inter_x1 are NRAM scratch buffers.
 */
template <typename IN_DT>
__mlu_func__ void findCoreMaxBox(
    IN_DT *input_score_ptr, IN_DT *score, IN_DT *inter_x1, IN_DT *max_box,
    const IN_DT *input_x1_ptr, const IN_DT *input_y1_ptr,
    const IN_DT *input_x2_ptr, const IN_DT *input_y2_ptr,
    const mluMemcpyDirection_t load_dir, const int input_offset,
    const int repeat, const int remain, const int remain_pad,
    const int max_seg_pad, int &max_index) {
  if (coreId != 0x80) {
    for (int i = 0; i <= repeat; i++) {
      if (i == repeat && remain == 0) {
        break;
      }
      int seg_len = 0;  // the length every nms compute
      int cpy_len = 0;  // the length every nms memcpy
      i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad;
      i == repeat ? cpy_len = remain : cpy_len = max_seg_pad;
      /******NMS LOAD START******/
      // zero-fill first so the tail padding cannot win the max
      __bang_write_zero(score, seg_len);
      __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
               cpy_len * sizeof(IN_DT), 0);
      /******NMS LOAD END******/
      // __bang_max: value in lane 0, argmax in lane 1 (type-width lanes)
      __bang_max(inter_x1, score, seg_len);
      if (inter_x1[0] > max_box[0]) {
        max_box[0] = inter_x1[0];
        if (sizeof(IN_DT) == sizeof(half)) {
          max_index = ((uint16_t *)inter_x1)[1] + input_offset +
                      i * max_seg_pad;  // offset start from head of input_data
        } else if (sizeof(IN_DT) == sizeof(float)) {
          max_index = ((uint32_t *)inter_x1)[1] + input_offset +
                      i * max_seg_pad;  // offset start from head of input_data
        }
      }
    }  // for repeat
    // the max box's x1, y1, x2, y2 on every core
    max_box[1] = input_x1_ptr[max_index];
    max_box[2] = input_y1_ptr[max_index];
    max_box[3] = input_x2_ptr[max_index];
    max_box[4] = input_y2_ptr[max_index];
    ((uint32_t *)(max_box + 5))[0] = max_index;
  }
}
/* Reduce the per-core max boxes of one cluster (via its SRAM) to the
 * cluster-wide max box, left in max_box on every core.
 *
 * Each core first publishes its REDUCE_NUM-entry box record to SRAM; after
 * the barrier, every core gathers the scores, finds the winning core with
 * __bang_max and copies that core's full record back.
 * inter_x1 is NRAM scratch (64 == NMS_SIZE elements).
 * NOTE(review): input_data_score and core_limit are currently unused here.
 */
template <typename IN_DT>
__mlu_func__ void findClusterMaxBox(IN_DT *sram, IN_DT *max_box,
                                    IN_DT *inter_x1, IN_DT *input_data_score,
                                    const int core_limit) {
  // find the max with sram
  // copy every core's box info to sram, form: score---x1---y1---x2---y2---
  __memcpy(sram + REDUCE_NUM * coreId, max_box, REDUCE_NUM * sizeof(IN_DT),
           NRAM2SRAM); // int32_t datatype
  __sync_cluster();
  // copy score from sram to nram and find the max
  __bang_write_zero(inter_x1, 64);
  __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
           REDUCE_NUM * sizeof(IN_DT), coreDim - 1);
  __bang_max(max_box, inter_x1, 64);
  // argmax (the winning coreId) is in the second lane of the result
  int max_core = sizeof(IN_DT) == sizeof(half) ? ((uint16_t *)max_box)[1]
                                               : ((uint32_t *)max_box)[1];
  // copy the max box to max_box
  __memcpy(max_box, sram + max_core * REDUCE_NUM, REDUCE_NUM * sizeof(IN_DT),
           SRAM2NRAM);
}
/*****************************************************************************/
/*******************************CALCULATE MAX AREA****************************/
/*****************************************************************************/
/* Compute the area of the current max box from max_box = {score, x1, y1,
 * x2, y2}. With algo == 1 and a non-zero offset, both side lengths are
 * enlarged by `offset` (pixel-inclusive box convention).
 */
template <typename IN_DT>
__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
                             float &max_area) {
  float extra = (algo == 0 || offset == 0.0) ? 0.0f : offset;
  float width = (float)max_box[3] - (float)max_box[1] + extra;
  float height = (float)max_box[4] - (float)max_box[2] + extra;
  max_area = width * height;
}
/* Compute the area of the current max box and return its (ordered)
 * corner coordinates through the reference parameters.
 *
 * Random inf inputs can break the x1 <= x2 / y1 <= y2 invariant, so each
 * coordinate pair is swapped into order before the area is computed.
 * With algo == 1 and a non-zero offset, both side lengths are enlarged
 * by `offset`.
 */
template <typename IN_DT>
__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
                             float &max_area, float &max_box_x1,
                             float &max_box_y1, float &max_box_x2,
                             float &max_box_y2) {
  float x_lo = float(max_box[1]);
  float x_hi = float(max_box[3]);
  if (max_box[1] > max_box[3]) {
    float tmp = x_lo;
    x_lo = x_hi;
    x_hi = tmp;
  }
  float y_lo = float(max_box[2]);
  float y_hi = float(max_box[4]);
  if (max_box[2] > max_box[4]) {
    float tmp = y_lo;
    y_lo = y_hi;
    y_hi = tmp;
  }
  max_box_x1 = x_lo;
  max_box_x2 = x_hi;
  max_box_y1 = y_lo;
  max_box_y2 = y_hi;
  float extra = (algo == 0 || offset == 0.0) ? 0.0f : offset;
  max_area = (x_hi - x_lo + extra) * (y_hi - y_lo + extra);
}
/***********************************************************************/
/*******************************STORE RESULT****************************/
/***********************************************************************/
/* Buffer the newly selected max box in NRAM and flush the buffer to GDRAM
 * when needed.
 *
 * Output modes:
 *   0 - kept indices only (uint32), contiguous
 *   1 - interleaved records {score, x1, y1, x2, y2} per box
 *   2 - planar: all scores, then all x1, ... (stride max_output_size in
 *       GDRAM, nram_save_limit_count in NRAM)
 *
 * The box is buffered only if its score beats thresh_score; the buffer is
 * flushed when it is full, when the score fell to/below the threshold
 * (last useful iteration), or on the final keep iteration. Only cluster 0
 * / core 0 touches nram_save and output_dram; output_dram is advanced past
 * the flushed data and output_box_num counts every accepted box.
 */
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void storeResult(IN_DT *max_box, OUT_DT *nram_save,
                              OUT_DT *&output_dram, const int keep,
                              const int nram_save_limit_count,
                              const int max_output_size,
                              const float thresh_score, const int output_mode,
                              int &nram_save_count, uint32_t &output_box_num) {
  /******NMS STORE START******/
  // store to nram
  if (float(max_box[0]) > thresh_score) {
    OUT_DT *save_ptr;
    int save_offset = 0;
    int save_str_num = 0;
    save_ptr = nram_save;
    save_offset = nram_save_count;
    save_str_num = nram_save_limit_count;
    if (clusterId == 0 && coreId == 0) {
      if (output_mode == 0) { // index1, index2, ...
        // the kept index was stashed as uint32 right after the 5 box fields
        save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0];
      } else if (output_mode == 1) { // score, x1, y1, x2, y2
        __memcpy(save_ptr + save_offset * INFO_NUM, max_box,
                 INFO_NUM * sizeof(IN_DT), NRAM2NRAM, INFO_NUM * sizeof(IN_DT),
                 INFO_NUM * sizeof(IN_DT), 0);
      } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2---
        __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), NRAM2NRAM,
                 save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), 4);
      }
    }
    nram_save_count++;
    output_box_num++;
  }
  // store to sram/gdram
  if (output_box_num != 0) {
    // flush on: buffer full, score dropped below threshold, or last keep
    if ((nram_save_count == nram_save_limit_count) ||
        (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) {
      if (nram_save_count != 0) {
        if (clusterId == 0 && coreId == 0) {
          if (output_mode == 0) { // index1, index2, ...
            pvLock();
            __memcpy(output_dram, nram_save, nram_save_count * sizeof(uint32_t),
                     NRAM2GDRAM);
            pvUnlock();
            output_dram += nram_save_count;
          } else if (output_mode == 1) { // score, x1, y1, x2, y2
            pvLock();
            __memcpy(output_dram, nram_save,
                     nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
            pvUnlock();
            output_dram += nram_save_count * INFO_NUM;
          } else if (output_mode ==
                     2) { // score---, x1---, y1---, x2---, y2---
            pvLock();
            __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
                     NRAM2GDRAM, max_output_size * sizeof(IN_DT),
                     nram_save_limit_count * sizeof(IN_DT), 4);
            pvUnlock();
            output_dram += nram_save_count;
          }
          nram_save_count = 0;
        }
      }
    } // if move data nram->sram/gdram
  } // if dst
}
/* Suppress boxes overlapping the current max box: compute the IoU of every
 * box in this core's segment against max_box and zero the score of each
 * box whose IoU exceeds thresh_iou, writing the scores back in place.
 *
 * fp16 input is widened to fp32 for the IoU math (hence the halved
 * max_seg_iou_compute segment and dt_offset staging) and converted back
 * before the store. The IoU test is done without division: the box is
 * dropped when area_I * (1 / thresh) >= area_U (or, for thresh_iou <= 0,
 * when area_U * thresh <= area_I).
 *
 * On __BANG_ARCH__ >= 300 the coordinates are loaded into inter_* buffers,
 * re-ordered with min/maxequal to restore x1<=x2 / y1<=y2 (inf inputs may
 * violate it), and fused FSA/FSM/FAS intrinsics plus a nan-safe lt/not
 * compare are used; older archs use the plain intrinsic sequence.
 */
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void scoreUpdate(
    IN_DT *input_score_ptr, const mluMemcpyDirection_t load_dir,
    const mluMemcpyDirection_t store_dir, const IN_DT *input_x1_ptr,
    const IN_DT *input_y1_ptr, const IN_DT *input_x2_ptr,
    const IN_DT *input_y2_ptr, IN_DT *x1, IN_DT *y1, IN_DT *x2, IN_DT *y2,
    IN_DT *score, IN_DT *inter_x1, IN_DT *inter_y1, IN_DT *inter_x2,
    IN_DT *inter_y2, IN_DT *max_box, const float max_box_x1,
    const float max_box_y1, const float max_box_x2, const float max_box_y2,
    OUT_DT *nram_save, int repeat_iou_compute, int remain_iou_compute,
    int remain_pad_iou_compute, int max_seg_iou_compute, int max_seg_pad,
    const float thresh_iou, const float div_thresh_iou, const int input_offset,
    const float offset, const float max_area, const int input_num_boxes,
    const int algo) {
  for (int i = 0; i <= repeat_iou_compute; i++) {
    if (i == repeat_iou_compute && remain_iou_compute == 0) {
      break;
    }
    int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute
                                            : max_seg_iou_compute;
    int cpy_len =
        (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute;
    /******NMS LOAD START******/
    int dt_offset = 0;
    if (sizeof(IN_DT) == sizeof(float)) {
      __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
               cpy_len * sizeof(IN_DT), 0);
      dt_offset = 0;
    } else if (sizeof(IN_DT) == sizeof(half)) {
      // stage half data in the upper part of the buffer, widen into fp32
      __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
               cpy_len * sizeof(IN_DT), 0);
      __bang_half2float((float *)score, (half *)x1, seg_len);
      dt_offset = max_seg_iou_compute;
    }
#if __BANG_ARCH__ >= 300
    // strided copy loads x1/y1/x2/y2 planes in one call (segment 3 extra)
    __memcpy(inter_x1 + dt_offset,
             input_x1_ptr + input_offset + i * max_seg_iou_compute,
             cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
             input_num_boxes * sizeof(IN_DT), 3);
    if (sizeof(IN_DT) == sizeof(half)) {
      __bang_half2float((float *)inter_x1,
                        (half *)inter_x1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)inter_y1,
                        (half *)inter_y1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)inter_x2,
                        (half *)inter_x2 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)inter_y2,
                        (half *)inter_y2 + max_seg_iou_compute, seg_len);
    }
    // box transfer
    __bang_minequal((float *)x1, (float *)inter_x1, (float *)inter_x2, seg_len);
    __bang_maxequal((float *)x2, (float *)inter_x1, (float *)inter_x2, seg_len);
    __bang_minequal((float *)y1, (float *)inter_y1, (float *)inter_y2, seg_len);
    __bang_maxequal((float *)y2, (float *)inter_y1, (float *)inter_y2, seg_len);
    // 1. compute IOU
    // get the area_I
    __bang_maxeq_scalar((float *)inter_x1, (float *)x1, max_box_x1,
                        seg_len); // inter_x1
    __bang_mineq_scalar((float *)inter_x2, (float *)x2, max_box_x2,
                        seg_len); // inter_x2
    __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
    }
    computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
                 seg_len); // inter_w
    __bang_maxeq_scalar((float *)inter_y1, (float *)y1, float(max_box_y1),
                        seg_len); // inter_y1
    __bang_mineq_scalar((float *)inter_y2, (float *)y2, float(max_box_y2),
                        seg_len); // inter_y2
    __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
    }
    computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
                 seg_len); // inter_h
    __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
               seg_len); // area_I
    // get the area of input_box: area = (x2 - x1) * (y2 - y1);
    if (algo == 1 && offset != 0.0) {
      __bang_fusion(FUSION_FSA, (float *)inter_y1, (float *)x2, (float *)x1,
                    offset, seg_len, seg_len);
      __bang_fusion(FUSION_FSA, (float *)inter_y2, (float *)y2, (float *)y1,
                    offset, seg_len, seg_len);
      __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
                 seg_len); // area
    } else {
      __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
      __bang_fusion(FUSION_FSM, (float *)inter_x2, (float *)y2, (float *)y1,
                    (float *)inter_y1, seg_len, seg_len);
    }
    // get the area_U: area + max_area - area_I
    __bang_fusion(FUSION_FAS, (float *)inter_x2, (float *)inter_x2, max_area,
                  (float *)inter_x1, seg_len, seg_len);
    // 2. select the box
    // if IOU greater than thres, set the score to zero, abort it: area_U >
    // area_I * (1 / thresh)?
    if (thresh_iou > 0.0) {
      __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
                        seg_len);
    } else {
      __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
                        seg_len);
    }
    // process for nan
    __bang_lt((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len);
    __bang_not((float *)inter_x1, (float *)inter_x1, seg_len);
    __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
    /******NMS COMPUTE END******/
#else
    __memcpy(x1 + dt_offset,
             input_x1_ptr + input_offset + i * max_seg_iou_compute,
             cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
             input_num_boxes * sizeof(IN_DT), 3);
    if (sizeof(IN_DT) == sizeof(half)) {
      __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, seg_len);
    }
    // 1. compute IOU
    // get the area_I
    __bang_write_value((float *)inter_y1, seg_len,
                       float(max_box[1])); // max_x1
    __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
                    seg_len); // inter_x1
    __bang_write_value((float *)inter_y2, seg_len,
                       float(max_box[3])); // max_x2
    __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
                    seg_len); // inter_x2
    __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
    }
    computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
                 seg_len); // inter_w
    __bang_write_value((float *)inter_x2, seg_len,
                       float(max_box[2])); // max_y1
    __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
                    seg_len); // inter_y1
    __bang_write_value((float *)inter_x2, seg_len,
                       float(max_box[4])); // max_y2
    __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
                    seg_len); // inter_y2
    __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
    }
    computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
                 seg_len); // inter_h
    __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
               seg_len); // area_I
    // get the area of input_box: area = (x2 - x1) * (y2 - y1);
    __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
    __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
      __bang_add_scalar((float *)inter_y2, (float *)inter_y2, offset, seg_len);
    }
    __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
               seg_len); // area
    // get the area_U: area + max_area - area_I
    __bang_add_scalar((float *)inter_x2, (float *)inter_x2, float(max_area),
                      seg_len);
    __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
               seg_len); // area_U
    // 2. select the box
    // if IOU greater than thresh, set the score to zero, abort it: area_U >
    // area_I * (1 / thresh)?
    if (thresh_iou > 0.0) {
      __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
                        seg_len);
    } else {
      __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
                        seg_len);
    }
    __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len);
    __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
    /******NMS COMPUTE END******/
#endif
    // update the score
    if (sizeof(IN_DT) == sizeof(half)) {
      convertFloat2half((half *)score, (float *)score, seg_len);
    }
    pvLock();
    __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
             cpy_len * sizeof(IN_DT), store_dir, cpy_len * sizeof(IN_DT),
             cpy_len * sizeof(IN_DT), 0);
    pvUnlock();
  }
}
#endif // NMS_UTILS_HPP_
...@@ -53,9 +53,8 @@ __mlu_func__ void loadDataFromDramToNram(T *dst, const T *src, ...@@ -53,9 +53,8 @@ __mlu_func__ void loadDataFromDramToNram(T *dst, const T *src,
int w_seg = position.w_end - position.w_start; int w_seg = position.w_end - position.w_start;
int size = h_seg * w_seg * shape_full.c; int size = h_seg * w_seg * shape_full.c;
__memcpy(dst, __memcpy(dst, src + position.n_start * n_offset +
src + position.n_start * n_offset + position.h_start * h_offset + position.h_start * h_offset + position.w_start * w_offset,
position.w_start * w_offset,
size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T), size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T),
n_seg - 1); n_seg - 1);
} }
...@@ -89,7 +88,7 @@ __mlu_func__ void psamaskCollectForward( ...@@ -89,7 +88,7 @@ __mlu_func__ void psamaskCollectForward(
int elem_count = int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c, CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c,
NFU_ALIGN_SIZE / sizeof(T)); NFU_ALIGN_SIZE / sizeof(T));
__nramset(y_nram, elem_count, (T)0); __bang_write_value(y_nram, elem_count, (T)0);
int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c; int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c;
int y_h_offset = shape_seg.w * shape_seg.c; int y_h_offset = shape_seg.w * shape_seg.c;
...@@ -155,7 +154,7 @@ __mlu_func__ void psamaskDistributeForward( ...@@ -155,7 +154,7 @@ __mlu_func__ void psamaskDistributeForward(
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T)); CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
int elem_count = int elem_count =
CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T)); CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T));
__nramset(y_nram_temp, elem_count, (T)0); __bang_write_value(y_nram_temp, elem_count, (T)0);
int y_n_offset = align_hw * align_c; int y_n_offset = align_hw * align_c;
int y_h_offset = shape_seg.w * align_c; int y_h_offset = shape_seg.w * align_c;
...@@ -242,7 +241,7 @@ __mlu_func__ void psamaskCollectBackward( ...@@ -242,7 +241,7 @@ __mlu_func__ void psamaskCollectBackward(
int elem_count = int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c, CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c,
NFU_ALIGN_SIZE / sizeof(T)); NFU_ALIGN_SIZE / sizeof(T));
__nramset(dx_nram, elem_count, (T)0); __bang_write_value(dx_nram, elem_count, (T)0);
int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c; int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c;
int dy_h_offset = shape_seg.w * dy_full.c; int dy_h_offset = shape_seg.w * dy_full.c;
...@@ -331,7 +330,8 @@ __mlu_func__ void psamaskDistributeBackward( ...@@ -331,7 +330,8 @@ __mlu_func__ void psamaskDistributeBackward(
// fill zeros to dx // fill zeros to dx
T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c; T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c;
int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c; int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c;
__nramset(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), (T)0); __bang_write_value(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)),
(T)0);
int dy_n_offset_seg = align_hw * align_c; int dy_n_offset_seg = align_hw * align_c;
int dy_h_offset_seg = shape_seg.w * align_c; int dy_h_offset_seg = shape_seg.w * align_c;
......
...@@ -130,10 +130,10 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core, ...@@ -130,10 +130,10 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
__memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM); __memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM);
// interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4 // interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
__bang_mul_const(tmp_cyc1, tmp_cyc1, w1, align_channel); __bang_mul_scalar(tmp_cyc1, tmp_cyc1, w1, align_channel);
__bang_mul_const(tmp_cyc2, tmp_cyc2, w2, align_channel); __bang_mul_scalar(tmp_cyc2, tmp_cyc2, w2, align_channel);
__bang_mul_const(tmp_cyc3, tmp_cyc3, w3, align_channel); __bang_mul_scalar(tmp_cyc3, tmp_cyc3, w3, align_channel);
__bang_mul_const(tmp_cyc4, tmp_cyc4, w4, align_channel); __bang_mul_scalar(tmp_cyc4, tmp_cyc4, w4, align_channel);
__bang_add(nram_in, tmp_cyc1, nram_in, align_channel); __bang_add(nram_in, tmp_cyc1, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc2, nram_in, align_channel); __bang_add(nram_in, tmp_cyc2, nram_in, align_channel);
...@@ -146,7 +146,7 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core, ...@@ -146,7 +146,7 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
} // loop_roi_grid_w } // loop_roi_grid_w
} // loop_roi_grid_h } // loop_roi_grid_h
T count_value = (T)(1.0 / count); T count_value = (T)(1.0 / count);
__bang_mul_const(nram_out, nram_out, count_value, align_channel); __bang_mul_scalar(nram_out, nram_out, count_value, align_channel);
__memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM); __memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM);
} // loop_cyc_num } // loop_cyc_num
} }
...@@ -242,8 +242,8 @@ __mlu_global__ void MLUUnion1KernelRoiAlignAvg( ...@@ -242,8 +242,8 @@ __mlu_global__ void MLUUnion1KernelRoiAlignAvg(
case CNRT_FLOAT16: { case CNRT_FLOAT16: {
roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned, roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned,
channels, pooled_height, pooled_width, input_height, channels, pooled_height, pooled_width, input_height,
input_width, sampling_ratio, input_width, sampling_ratio, (half)spatial_scale,
(half)spatial_scale, num_rois); num_rois);
}; break; }; break;
case CNRT_FLOAT32: { case CNRT_FLOAT32: {
roialignForwardAvg((float *)input, (float *)rois, (float *)output, roialignForwardAvg((float *)input, (float *)rois, (float *)output,
...@@ -346,31 +346,31 @@ __mlu_func__ void unionRoiAlignBp( ...@@ -346,31 +346,31 @@ __mlu_func__ void unionRoiAlignBp(
&x_high, &y_low, &y_high); &x_high, &y_low, &y_high);
if (x_low >= 0 && y_low >= 0) { if (x_low >= 0 && y_low >= 0) {
__memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM); __memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM);
__bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w1, __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w1,
c_align); c_align);
__bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align); 1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align, __bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_low * c, image_offset + y_low * wo * c + x_low * c,
(T *)buffer + c_align, c); (T *)buffer + c_align, c);
__bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w2, __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w2,
c_align); c_align);
__bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align); 1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align, __bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_high * c, image_offset + y_low * wo * c + x_high * c,
(T *)buffer + c_align, c); (T *)buffer + c_align, c);
__bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w3, __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w3,
c_align); c_align);
__bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align); 1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align, __bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_low * c, image_offset + y_high * wo * c + x_low * c,
(T *)buffer + c_align, c); (T *)buffer + c_align, c);
__bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w4, __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w4,
c_align); c_align);
__bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align); 1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align, __bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_high * c, image_offset + y_high * wo * c + x_high * c,
(T *)buffer + c_align, c); (T *)buffer + c_align, c);
...@@ -401,34 +401,34 @@ __mlu_func__ void unionRoiAlignBp( ...@@ -401,34 +401,34 @@ __mlu_func__ void unionRoiAlignBp(
} }
__memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T), __memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T),
GDRAM2NRAM); GDRAM2NRAM);
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w1, __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w1,
align_c); align_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c); 1 / count, align_c);
__bang_atomic_add( __bang_atomic_add(
(T *)buffer + align_c, (T *)buffer + align_c,
image_offset + y_low * wo * c + x_low * c + i * deal_once, image_offset + y_low * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c); (T *)buffer + align_c, deal_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w2, __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w2,
align_c); align_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c); 1 / count, align_c);
__bang_atomic_add( __bang_atomic_add(
(T *)buffer + align_c, (T *)buffer + align_c,
image_offset + y_low * wo * c + x_high * c + i * deal_once, image_offset + y_low * wo * c + x_high * c + i * deal_once,
(T *)buffer + align_c, deal_c); (T *)buffer + align_c, deal_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w3, __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w3,
align_c); align_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c); 1 / count, align_c);
__bang_atomic_add( __bang_atomic_add(
(T *)buffer + align_c, (T *)buffer + align_c,
image_offset + y_high * wo * c + x_low * c + i * deal_once, image_offset + y_high * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c); (T *)buffer + align_c, deal_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w4, __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w4,
align_c); align_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c); 1 / count, align_c);
__bang_atomic_add( __bang_atomic_add(
(T *)buffer + align_c, (T *)buffer + align_c,
image_offset + y_high * wo * c + x_high * c + i * deal_once, image_offset + y_high * wo * c + x_high * c + i * deal_once,
......
...@@ -204,11 +204,11 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch, ...@@ -204,11 +204,11 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
} }
if (is_empty) { if (is_empty) {
__nramset((T *)nram_out, c_slice_align, (T)0); __bang_write_value((T *)nram_out, c_slice_align, (T)0);
__memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out, __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
c_slice * t_size, NRAM2GDRAM); c_slice * t_size, NRAM2GDRAM);
if (NULL != argmax) { if (NULL != argmax) {
__nramset((int32_t *)nram_out, c_slice_align, (int32_t)(-1)); __bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
__memcpy((int32_t *)argmax_base + dst_offset + c_offset, __memcpy((int32_t *)argmax_base + dst_offset + c_offset,
(int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM); (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM);
} }
...@@ -238,18 +238,18 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch, ...@@ -238,18 +238,18 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
for (int i = 0; i < c_slice; i++) { for (int i = 0; i < c_slice; i++) {
nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim); nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim);
} }
__bang_add_const((float *)nram_a, (float *)nram_out, (float)bin_y1, __bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1,
c_slice_align); c_slice_align);
__bang_mul_const((float *)nram_ping, (float *)nram_a, (float)width, __bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width,
c_slice_align); c_slice_align);
/*compute input_w*/ /*compute input_w*/
__bang_mul_const((float *)nram_a, (float *)nram_out, (float)bin_wdim, __bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim,
c_slice_align); c_slice_align);
__bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a, __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a,
c_slice_align); c_slice_align);
__bang_add_const((float *)nram_a, (float *)nram_a, (float)bin_x1, __bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1,
c_slice_align); c_slice_align);
__bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a, __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a,
c_slice_align); c_slice_align);
convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a, convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a,
...@@ -290,9 +290,7 @@ __mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type, ...@@ -290,9 +290,7 @@ __mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type,
rois_num, (float)spatial_scale, (float *)output_data, rois_num, (float)spatial_scale, (float *)output_data,
argmax); argmax);
}; break; }; break;
default: { default: { break; }
break;
}
} }
} }
} // namespace forward } // namespace forward
...@@ -328,30 +326,30 @@ __mlu_func__ void convertIndex( ...@@ -328,30 +326,30 @@ __mlu_func__ void convertIndex(
align_c); align_c);
// Perform 'temp_result - hstart' operation // Perform 'temp_result - hstart' operation
__bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart, __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
align_c); align_c);
// Perform 'temp_result1 - temp_result2 * width' operation // Perform 'temp_result1 - temp_result2 * width' operation
__bang_mul_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width, __bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
align_c); align_c);
convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
(int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c); (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
__bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, __bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp,
(float *)nram_argmax_fp_w, align_c); (float *)nram_argmax_fp_w, align_c);
// Perform 'temp_result - wstart' operation // Perform 'temp_result - wstart' operation
__bang_sub_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, wstart, __bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w,
align_c); wstart, align_c);
// Perform 'temp_result = h * w_compute + w' operation // Perform 'temp_result = h * w_compute + w' operation
__bang_mul_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, __bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
w_compute, align_c); w_compute, align_c);
__bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
(float *)nram_argmax_fp_w, align_c); (float *)nram_argmax_fp_w, align_c);
if (loop_flag == 1) { if (loop_flag == 1) {
__bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
(loop_id * true_limit), align_c); (loop_id * true_limit), align_c);
} }
convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1, convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1,
(float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2, (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2,
...@@ -460,21 +458,22 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads, ...@@ -460,21 +458,22 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
*/ */
// Load the data from GDRAM to NRAM. // Load the data from GDRAM to NRAM.
__memcpy((T *)nram_grads + align_c * high_precision, __memcpy(
(const T *)grads + (n * pooled_height * pooled_width + (T *)nram_grads + align_c * high_precision,
ph * pooled_width + pw) * (const T *)grads +
channels, (n * pooled_height * pooled_width + ph * pooled_width + pw) *
channels * sizeof(T), GDRAM2NRAM); channels,
channels * sizeof(T), GDRAM2NRAM);
if (high_precision) { if (high_precision) {
__bang_half2float((float *)nram_grads, __bang_half2float((float *)nram_grads,
(half *)nram_grads + align_c * high_precision, (half *)nram_grads + align_c * high_precision,
align_c); align_c);
} }
__memcpy((int32_t *)nram_argmax, __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
(const int32_t *)argmax + (n * pooled_height * pooled_width + (n * pooled_height * pooled_width +
ph * pooled_width + pw) * ph * pooled_width + pw) *
channels, channels,
channels * sizeof(int32_t), GDRAM2NRAM); channels * sizeof(int32_t), GDRAM2NRAM);
// Perform pooling operation on NRAM. // Perform pooling operation on NRAM.
...@@ -523,20 +522,21 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads, ...@@ -523,20 +522,21 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
*/ */
// Load the data from GDRAM to NRAM. // Load the data from GDRAM to NRAM.
__memcpy((T *)nram_grads + align_c * high_precision, __memcpy(
(const T *)grads + (n * pooled_height * pooled_width + (T *)nram_grads + align_c * high_precision,
ph * pooled_width + pw) * (const T *)grads +
channels, (n * pooled_height * pooled_width + ph * pooled_width + pw) *
channels * sizeof(T), GDRAM2NRAM); channels,
channels * sizeof(T), GDRAM2NRAM);
if (high_precision) { if (high_precision) {
__bang_half2float((float *)nram_grads, __bang_half2float((float *)nram_grads,
(half *)nram_grads + align_c * high_precision, (half *)nram_grads + align_c * high_precision,
align_c); align_c);
} }
__memcpy((int32_t *)nram_argmax, __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
(const int32_t *)argmax + (n * pooled_height * pooled_width + (n * pooled_height * pooled_width +
ph * pooled_width + pw) * ph * pooled_width + pw) *
channels, channels,
channels * sizeof(int32_t), GDRAM2NRAM); channels * sizeof(int32_t), GDRAM2NRAM);
int ping_pong = 0; int ping_pong = 0;
...@@ -713,9 +713,7 @@ __mlu_global__ void MLUKernelRoiPoolBackward( ...@@ -713,9 +713,7 @@ __mlu_global__ void MLUKernelRoiPoolBackward(
height, width, pooled_height, pooled_width, rois_num, height, width, pooled_height, pooled_width, rois_num,
(const float)spatial_scale, high_precision); (const float)spatial_scale, high_precision);
}; break; }; break;
default: { default: { break; }
break;
}
} }
} }
} // namespace backward } // namespace backward
......
...@@ -26,7 +26,7 @@ __mlu_func__ void mluMultiKernelTinShift( ...@@ -26,7 +26,7 @@ __mlu_func__ void mluMultiKernelTinShift(
int t_shift = shifts[n_index * group_size + group_id]; int t_shift = shifts[n_index * group_size + group_id];
int index = cur_channel_index % channel_size * hw_size + int index = cur_channel_index % channel_size * hw_size +
n_index * time_size * channel_size * hw_size; n_index * time_size * channel_size * hw_size;
__nramset(data_nram, MAX_NRAM_SIZE, (char)0); __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;"); __asm__ volatile("sync;");
if (abs(t_shift) >= time_size) { if (abs(t_shift) >= time_size) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
...@@ -109,7 +109,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence( ...@@ -109,7 +109,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence(
int next_sequence_index = int next_sequence_index =
index / hw_size / channel_size % time_size + segmentime_size; index / hw_size / channel_size % time_size + segmentime_size;
int cur_sequence_index = index / hw_size / channel_size % time_size; int cur_sequence_index = index / hw_size / channel_size % time_size;
__nramset(data_nram, MAX_NRAM_SIZE, (char)0); __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;"); __asm__ volatile("sync;");
if (max_number_hw_per_core == 0) { if (max_number_hw_per_core == 0) {
mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index, mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment