Commit e847cf8a authored by bdf, committed by Zaida Zhou

[Refactor] Adapt mlu code to cntoolkit3.0.1

parent 4c6e99c8
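The whole change is a mechanical rename of BANG intrinsics that cntoolkit 3.x replaced, as seen in every hunk below (argument order is unchanged in each case):

// cntoolkit 2.x name                          // cntoolkit 3.x name
__nramset(dst, count, value);                  __bang_write_value(dst, count, value);
__bang_add_const(dst, src, scalar, count);     __bang_add_scalar(dst, src, scalar, count);
__bang_sub_const(dst, src, scalar, count);     __bang_sub_scalar(dst, src, scalar, count);
__bang_mul_const(dst, src, scalar, count);     __bang_mul_scalar(dst, src, scalar, count);

The rest of the diff is clang-format rewrapping forced by the longer names, plus one new float32-to-half helper added to the common header.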
@@ -88,14 +88,14 @@ __mlu_func__ void bboxOverlapsWorkflow(
     // right - left + offset ---> left
     __bang_sub(vec_left, vec_right, vec_left, batches_stride);
-    __bang_add_const(vec_left, vec_left, (T)offset, batches_stride);
+    __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
     // bottom - top + offset ---> right
     __bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
-    __bang_add_const(vec_right, vec_right, (T)offset, batches_stride);
+    __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
     // zero vector ---> bottom
-    __nramset(vec_bottom, batches_stride, 0.f);
+    __bang_write_value(vec_bottom, batches_stride, 0.f);
     // width --> vec_left
     __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
@@ -107,11 +107,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
     // get the b1_area
     // (b1_x2 - b1_x1 + offset) ---> vec_top
     __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
-    __bang_add_const(vec_top, vec_top, (T)offset, batches_stride);
+    __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
     // (b1_y2 - b1_y1 + offset) ---> vec_bottom
     __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
-    __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride);
+    __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
     // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
     // ---> vec_top;
@@ -121,11 +121,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
     // get the b2_area
     // (b2_x2 - b2_x1 + offset) ---> b2_x1
     __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
-    __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
+    __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
     // (b2_y2 - b2_y1 + offset) ---> b2_y1
     __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
-    __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
+    __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
     // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
     // ---> b2_x1;
@@ -137,7 +137,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
     T *inter_s = height;
     // offset vector ---> vec_b2_y1
-    __nramset(vec_b2_y1, batches_stride, T(offset));
+    __bang_write_value(vec_b2_y1, batches_stride, T(offset));
     T *vec_offset = vec_b2_y1;
     if (mode == 0) {
@@ -164,10 +164,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
     int32_t base1 = b1 * COORD_NUM;
     // set bbox1 and bbox2 to nram
-    __nramset(vec_b1_x1, batches_stride, bbox1[base1]);
-    __nramset(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
-    __nramset(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
-    __nramset(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
+    __bang_write_value(vec_b1_x1, batches_stride, bbox1[base1]);
+    __bang_write_value(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
+    __bang_write_value(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
+    __bang_write_value(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
     for (int32_t j = 0; j < num_loop_cpy; j++) {
       int32_t index2 = j * batches_stride;
@@ -195,13 +195,13 @@ __mlu_func__ void bboxOverlapsWorkflow(
       // right - left + offset ---> left
       __bang_sub(vec_left, vec_right, vec_left, batches_stride);
-      __bang_add_const(vec_left, vec_left, (T)offset, batches_stride);
+      __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
       // bottom - top + offset ---> right
       __bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
-      __bang_add_const(vec_right, vec_right, (T)offset, batches_stride);
+      __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
       // zero vector ---> bottom
-      __nramset(vec_bottom, batches_stride, (T)0);
+      __bang_write_value(vec_bottom, batches_stride, (T)0);
       // width --> vec_left
       __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
@@ -213,10 +213,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
       // get the b1_area
       // (b1_x2 - b1_x1 + offset) ---> vec_top
       __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
-      __bang_add_const(vec_top, vec_top, (T)offset, batches_stride);
+      __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
       // (b1_y2 - b1_y1 + offset) ---> vec_bottom
       __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
-      __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride);
+      __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
       // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
       // ---> vec_top;
       __bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
@@ -225,10 +225,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
       // get the b2_area
       // (b2_x2 - b2_x1 + offset) ---> b2_x1
       __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
-      __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
+      __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
       // (b2_y2 - b2_y1 + offset) ---> b2_y1
       __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
-      __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
+      __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
       // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
       // ---> b2_x1;
       __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
@@ -239,7 +239,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
       T *inter_s = height;
       // offset vector ---> vec_b2_y1
-      __nramset(vec_b2_y1, batches_stride, T(offset));
+      __bang_write_value(vec_b2_y1, batches_stride, T(offset));
       T *vec_offset = vec_b2_y1;
       if (mode == 0) {
...
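For orientation, a host-side scalar sketch of what the vectorized workflow computes for one box pair in mode 0 (IoU); the helper name is illustrative, and the clamped width/height and area terms follow the comments in the hunks above:

#include <algorithm>

// Illustrative scalar counterpart of bboxOverlapsWorkflow (mode == 0, IoU).
template <typename T>
T bboxOverlapScalar(const T b1[4], const T b2[4], T offset) {
  // width/height are clamped at zero, like the __bang_maxequal against the
  // zero vector above.
  T w = std::max(T(0), std::min(b1[2], b2[2]) - std::max(b1[0], b2[0]) + offset);
  T h = std::max(T(0), std::min(b1[3], b2[3]) - std::max(b1[1], b2[1]) + offset);
  T inter = w * h;
  T area1 = (b1[2] - b1[0] + offset) * (b1[3] - b1[1] + offset);
  T area2 = (b2[2] - b2[0] + offset) * (b2[3] - b2[1] + offset);
  return inter / (area1 + area2 - inter);
}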
@@ -139,7 +139,7 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
   blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1;
   // set output_nram to zero
-  __nramset(output_nram, param.output_nram_size, T(0));
+  __bang_write_value(output_nram, param.output_nram_size, T(0));
   // loop blocks of kernel window: grid_dim.(Kh, Kw)
   for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) {
@@ -313,8 +313,8 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
       T *sum = sum_array;
       for (int g = 0; g < blkSize.G; ++g) {
-        __bang_mul_const(sum, src, mask_array[mask_index],
-                         param.block_Cg_NFU);
+        __bang_mul_scalar(sum, src, mask_array[mask_index],
+                          param.block_Cg_NFU);
         //
         // NOTE: Since block_Cg_NFU >= block_Cg_stride,
         // overlapped writing may occur on sum_array.
@@ -446,8 +446,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
     T *base_grad_input = (T *)grad_input + input_index;
     __memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T),
              GDRAM2NRAM);
-    __bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff,
-                     ((T *)mask_buff)[mask_index], num_align);
+    __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
+                      ((T *)mask_buff)[mask_index], num_align);
     __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
                       (T *)grad_input_buff, num_align);
     __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
@@ -485,8 +485,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
     T *base_grad_input = (T *)grad_input + input_index;
     __memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T),
              GDRAM2NRAM);
-    __bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff,
-                     ((T *)mask_buff)[mask_index], rem_for_loop_align);
+    __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
+                      ((T *)mask_buff)[mask_index], rem_for_loop_align);
     __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
                       (T *)grad_input_buff, rem_for_loop);
     __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
@@ -541,12 +541,12 @@ void KernelCarafeBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                           const int wi, const int c, const int k_up,
                           const int group, const int scale) {
   if (dtype == CNRT_FLOAT16) {
-    backward::MLUUnion1KernelCarafeBackward<half>
-        <<<k_dim, k_type, queue>>>(input, mask, grad_output, grad_input,
-                                   grad_mask, n, hi, wi, c, k_up, group, scale);
+    backward::MLUUnion1KernelCarafeBackward<half><<<k_dim, k_type, queue>>>(
+        input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
+        group, scale);
   } else {
-    backward::MLUUnion1KernelCarafeBackward<float>
-        <<<k_dim, k_type, queue>>>(input, mask, grad_output, grad_input,
-                                   grad_mask, n, hi, wi, c, k_up, group, scale);
+    backward::MLUUnion1KernelCarafeBackward<float><<<k_dim, k_type, queue>>>(
+        input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
+        group, scale);
   }
 }
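The two CarafeCompute hunks implement the standard CARAFE backward rule; a scalar sketch of one (output position, kernel tap) step, with hypothetical names for the tile arguments:

// grad_input gets the mask-weighted output gradient (the __bang_mul_scalar
// + __bang_atomic_add pair above); grad_mask gets the input/grad_output
// dot product (the __bang_mul above, reduced afterwards).
template <typename T>
void carafeBackwardTap(T *grad_input, T *grad_mask, const T *grad_output,
                       const T *input, T mask_w, int mask_index, int len) {
  T acc = T(0);
  for (int i = 0; i < len; ++i) {
    grad_input[i] += mask_w * grad_output[i];
    acc += grad_output[i] * input[i];
  }
  grad_mask[mask_index] += acc;
}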
@@ -211,51 +211,52 @@ __mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src,
   // get sign bit
   const float move_23bit = 8388608.0;
   // 0x80000000 = 1,000000000,0000000000000000000000000000
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x80000000);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x80000000);
   __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition,
                     src_count * sizeof(float), NFU_ALIGN_SIZE);
   // get 1 or 0 from sign bit
   // judg is Odd
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x00000001);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x00000001);
   __bang_cycle_bor((char *)dst_addition, (char *)dst_addition,
                    (char *)src_addition, src_count * sizeof(float),
                    NFU_ALIGN_SIZE);
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x80000001);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x80000001);
   __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
   // minus xor, positive num invariant
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0xffffffff);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0xffffffff);
   __bang_cycle_mul(dst, dst_addition, src_addition, src_count,
                    NFU_ALIGN_SIZE / sizeof(float));
   __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float));
   // convert int32 to float32
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x7fffff);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x7fffff);
   __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition,
                     src_count * sizeof(float), NFU_ALIGN_SIZE);
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x4b000000);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x4b000000);
   __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition,
                    src_count * sizeof(float), NFU_ALIGN_SIZE);
-  __bang_sub_const(dst, dst, move_23bit, src_count);
+  __bang_sub_scalar(dst, dst, move_23bit, src_count);
   // add one
   __bang_add(dst, dst, dst_addition, src_count);
   // set sign for float32
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0xffffffff);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0xffffffff);
   __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count,
                    NFU_ALIGN_SIZE / sizeof(float));
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x00000001);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x00000001);
   __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count,
                    NFU_ALIGN_SIZE / sizeof(float));
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x80000000);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x80000000);
   __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                     (char *)src_addition, src_count * 4, 128);
   __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4);
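convertInt2Float keeps the classic exponent-bias trick; a host-side scalar sketch of its core (sign handling omitted), assuming 0 <= v < 2^23:

#include <cstdint>
#include <cstring>

// OR the low 23 bits into the bit pattern of 2^23 (0x4b000000), reinterpret
// as float, then subtract move_23bit: the mantissa carries v exactly.
float int23ToFloat(int32_t v) {       // assumes 0 <= v < (1 << 23)
  uint32_t bits = (uint32_t(v) & 0x007fffffu) | 0x4b000000u;
  float f;
  std::memcpy(&f, &bits, sizeof(f));  // f == 8388608.0f + v
  return f - 8388608.0f;              // the __bang_sub_scalar step above
}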
@@ -291,18 +292,20 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
   // dst_addition = abs(src)
   __bang_mul(dst_addition, src, (float *)dst, src_count);
   // if dst_addition < 1.0 , then src_addition + 1, to fix add error.
-  __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 1.0f);
+  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     1.0f);
   __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
   __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count);
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0xbf800000);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0xbf800000);
   // set negative flag -1.0 = 0xbf80000
   __bang_cycle_eq(
       (float *)dst, (float *)dst, (float *)src_addition, src_count,
       NFU_ALIGN_SIZE / sizeof(float));  // to mark all src in [x<-1.0]
   __bang_active_abs(dst_addition, src, src_count);
-  __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 8388608.0f);
+  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     8388608.0f);
   // mask shift move 23
   __bang_cycle_add_tz(
       dst_addition, dst_addition, src_addition, src_count,
@@ -314,12 +317,12 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
   // to fix max value
   // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0,
   // means max value.
-  __bang_mul_const((float *)dst, (float *)dst, 16777215.0, src_count);
+  __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count);
   __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst,
               src_count * floatDchar);
   // get low 23bit
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            (unsigned)0x007fffff);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     (unsigned)0x007fffff);
   // mask low 23bit is 1
   __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                     (char *)src_addition, src_count * floatDchar,
@@ -327,16 +330,36 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
   // set 9 high bit ===> dst
   // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000
   // 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000
-  __nramset(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);
+  __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);
   __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count,
                    NFU_ALIGN_SIZE / sizeof(float));
   // src or dst_addition
   __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition,
              src_count * floatDchar);
-  __bang_mul_const((float *)dst, (float *)dst, -2.0, src_count);
+  __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count);
   __bang_bor((char *)dst, (char *)dst, (char *)dst_addition,
              src_count * floatDchar);
 #endif  // __BANG_ARCH__ >= 300
 }
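convertFloat2Int inverts the same trick; sketched for a float that already holds an exact integer in [0, 2^23) (which the argmax indices converted in the roi_pool kernels below satisfy), so the rounding mode of the addition is moot:

#include <cstdint>
#include <cstring>

// Adding 2^23 forces the integer part into the mantissa; the kernel then
// keeps the low 23 bits with the 0x007fffff mask ("get low 23bit" above).
int32_t floatToInt23(float f) {       // assumes f integral, 0 <= f < (1 << 23)
  float shifted = f + 8388608.0f;     // __bang_cycle_add_tz counterpart
  uint32_t bits;
  std::memcpy(&bits, &shifted, sizeof(bits));
  return int32_t(bits & 0x007fffffu);
}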
+/*!
+ * @brief Converts float32 to half data type,
+ * the rounding mode on MLU200 is rd, on MLU300 is rn.
+ *
+ * @param[out] dst
+ *   Pointer to NRAM that stores half type data.
+ * @param[in] src
+ *   Pointer to NRAM that stores float32 type data.
+ * @param[in] src_count
+ *   The count of elements in src.
+ */
+__mlu_func__ inline void convertFloat2half(half *dst, float *src,
+                                           int src_count) {
+#if __BANG_ARCH__ >= 300
+  __bang_float2half_rn(dst, src, src_count);
+#else
+  __bang_float2half_rd(dst, src, src_count);
+#endif
+}
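A numeric illustration of the two rounding modes named in the new helper's comment: half values in [1024, 2048) are spaced 1.0 apart, so 1025.5f lies exactly between two representable halves; __bang_float2half_rd (the MLU200 path) rounds down to 1025.0, while __bang_float2half_rn (the MLU300 path) rounds to the nearest even mantissa, 1026.0.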
 #endif  // COMMON_MLU_HELPER_HPP_
@@ -53,9 +53,8 @@ __mlu_func__ void loadDataFromDramToNram(T *dst, const T *src,
   int w_seg = position.w_end - position.w_start;
   int size = h_seg * w_seg * shape_full.c;
-  __memcpy(dst,
-           src + position.n_start * n_offset + position.h_start * h_offset +
-               position.w_start * w_offset,
+  __memcpy(dst, src + position.n_start * n_offset +
+                    position.h_start * h_offset + position.w_start * w_offset,
            size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T),
            n_seg - 1);
 }
@@ -89,7 +88,7 @@ __mlu_func__ void psamaskCollectForward(
   int elem_count =
       CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c,
                  NFU_ALIGN_SIZE / sizeof(T));
-  __nramset(y_nram, elem_count, (T)0);
+  __bang_write_value(y_nram, elem_count, (T)0);
   int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c;
   int y_h_offset = shape_seg.w * shape_seg.c;
@@ -155,7 +154,7 @@ __mlu_func__ void psamaskDistributeForward(
       CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
   int elem_count =
       CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T));
-  __nramset(y_nram_temp, elem_count, (T)0);
+  __bang_write_value(y_nram_temp, elem_count, (T)0);
   int y_n_offset = align_hw * align_c;
   int y_h_offset = shape_seg.w * align_c;
@@ -242,7 +241,7 @@ __mlu_func__ void psamaskCollectBackward(
   int elem_count =
       CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c,
                  NFU_ALIGN_SIZE / sizeof(T));
-  __nramset(dx_nram, elem_count, (T)0);
+  __bang_write_value(dx_nram, elem_count, (T)0);
   int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c;
   int dy_h_offset = shape_seg.w * dy_full.c;
@@ -331,7 +330,8 @@ __mlu_func__ void psamaskDistributeBackward(
   // fill zeros to dx
   T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c;
   int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c;
-  __nramset(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), (T)0);
+  __bang_write_value(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)),
+                     (T)0);
   int dy_n_offset_seg = align_hw * align_c;
   int dy_h_offset_seg = shape_seg.w * align_c;
...
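Every NRAM fill above is sized with CEIL_ALIGN; a typical round-up definition (the actual macro lives in the shared MLU header and may differ):

// Round x up to the next multiple of align (align > 0).
#define CEIL_ALIGN(x, align) (((x) + (align)-1) / (align) * (align))

For example, CEIL_ALIGN(100, NFU_ALIGN_SIZE / sizeof(float)) with a 128-byte NFU_ALIGN_SIZE rounds 100 elements up to 128, so fills stay a whole number of NFU lines.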
@@ -130,10 +130,10 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
         __memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM);
         // interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
-        __bang_mul_const(tmp_cyc1, tmp_cyc1, w1, align_channel);
-        __bang_mul_const(tmp_cyc2, tmp_cyc2, w2, align_channel);
-        __bang_mul_const(tmp_cyc3, tmp_cyc3, w3, align_channel);
-        __bang_mul_const(tmp_cyc4, tmp_cyc4, w4, align_channel);
+        __bang_mul_scalar(tmp_cyc1, tmp_cyc1, w1, align_channel);
+        __bang_mul_scalar(tmp_cyc2, tmp_cyc2, w2, align_channel);
+        __bang_mul_scalar(tmp_cyc3, tmp_cyc3, w3, align_channel);
+        __bang_mul_scalar(tmp_cyc4, tmp_cyc4, w4, align_channel);
         __bang_add(nram_in, tmp_cyc1, nram_in, align_channel);
         __bang_add(nram_in, tmp_cyc2, nram_in, align_channel);
@@ -146,7 +146,7 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
       }  // loop_roi_grid_w
     }    // loop_roi_grid_h
     T count_value = (T)(1.0 / count);
-    __bang_mul_const(nram_out, nram_out, count_value, align_channel);
+    __bang_mul_scalar(nram_out, nram_out, count_value, align_channel);
     __memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM);
   }  // loop_cyc_num
 }
@@ -242,8 +242,8 @@ __mlu_global__ void MLUUnion1KernelRoiAlignAvg(
     case CNRT_FLOAT16: {
       roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned,
                          channels, pooled_height, pooled_width, input_height,
-                         input_width, sampling_ratio,
-                         (half)spatial_scale, num_rois);
+                         input_width, sampling_ratio, (half)spatial_scale,
+                         num_rois);
     }; break;
     case CNRT_FLOAT32: {
       roialignForwardAvg((float *)input, (float *)rois, (float *)output,
@@ -346,31 +346,31 @@ __mlu_func__ void unionRoiAlignBp(
                               &x_high, &y_low, &y_high);
         if (x_low >= 0 && y_low >= 0) {
           __memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM);
-          __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w1,
-                           c_align);
-          __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
-                           1 / count, c_align);
+          __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w1,
+                            c_align);
+          __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
+                            1 / count, c_align);
           __bang_atomic_add((T *)buffer + c_align,
                             image_offset + y_low * wo * c + x_low * c,
                             (T *)buffer + c_align, c);
-          __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w2,
-                           c_align);
-          __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
-                           1 / count, c_align);
+          __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w2,
+                            c_align);
+          __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
+                            1 / count, c_align);
           __bang_atomic_add((T *)buffer + c_align,
                             image_offset + y_low * wo * c + x_high * c,
                             (T *)buffer + c_align, c);
-          __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w3,
-                           c_align);
-          __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
-                           1 / count, c_align);
+          __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w3,
+                            c_align);
+          __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
+                            1 / count, c_align);
           __bang_atomic_add((T *)buffer + c_align,
                             image_offset + y_high * wo * c + x_low * c,
                             (T *)buffer + c_align, c);
-          __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w4,
-                           c_align);
-          __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
-                           1 / count, c_align);
+          __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w4,
+                            c_align);
+          __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
+                            1 / count, c_align);
           __bang_atomic_add((T *)buffer + c_align,
                             image_offset + y_high * wo * c + x_high * c,
                             (T *)buffer + c_align, c);
@@ -401,34 +401,34 @@ __mlu_func__ void unionRoiAlignBp(
           }
           __memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T),
                    GDRAM2NRAM);
-          __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w1,
-                           align_c);
-          __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
-                           1 / count, align_c);
+          __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w1,
+                            align_c);
+          __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
+                            1 / count, align_c);
           __bang_atomic_add(
               (T *)buffer + align_c,
               image_offset + y_low * wo * c + x_low * c + i * deal_once,
              (T *)buffer + align_c, deal_c);
-          __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w2,
-                           align_c);
-          __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
-                           1 / count, align_c);
+          __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w2,
+                            align_c);
+          __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
+                            1 / count, align_c);
           __bang_atomic_add(
               (T *)buffer + align_c,
              image_offset + y_low * wo * c + x_high * c + i * deal_once,
              (T *)buffer + align_c, deal_c);
-          __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w3,
-                           align_c);
-          __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
-                           1 / count, align_c);
+          __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w3,
+                            align_c);
+          __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
+                            1 / count, align_c);
           __bang_atomic_add(
              (T *)buffer + align_c,
              image_offset + y_high * wo * c + x_low * c + i * deal_once,
              (T *)buffer + align_c, deal_c);
-          __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w4,
-                           align_c);
-          __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
-                           1 / count, align_c);
+          __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w4,
+                            align_c);
+          __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
+                            1 / count, align_c);
           __bang_atomic_add(
               (T *)buffer + align_c,
               image_offset + y_high * wo * c + x_high * c + i * deal_once,
...
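Both unionRoiAlignBp hunks scatter one pooled gradient bilinearly onto the four neighboring input cells; per channel the arithmetic reduces to the sketch below (names hypothetical; the (y * wo + x) * c offsets match the atomic adds above):

// w1..w4 are the bilinear corner weights, count the number of sampling
// points averaged in the forward pass, g one pooled gradient value.
template <typename T>
void scatterBilinear(T *image, int wo, int c, int ch, int x_low, int x_high,
                     int y_low, int y_high, T w1, T w2, T w3, T w4, T g,
                     T count) {
  image[(y_low * wo + x_low) * c + ch] += w1 * g / count;
  image[(y_low * wo + x_high) * c + ch] += w2 * g / count;
  image[(y_high * wo + x_low) * c + ch] += w3 * g / count;
  image[(y_high * wo + x_high) * c + ch] += w4 * g / count;
}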
@@ -204,11 +204,11 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
     }
     if (is_empty) {
-      __nramset((T *)nram_out, c_slice_align, (T)0);
+      __bang_write_value((T *)nram_out, c_slice_align, (T)0);
       __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
                c_slice * t_size, NRAM2GDRAM);
       if (NULL != argmax) {
-        __nramset((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
+        __bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
         __memcpy((int32_t *)argmax_base + dst_offset + c_offset,
                  (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM);
       }
@@ -238,18 +238,18 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
       for (int i = 0; i < c_slice; i++) {
         nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim);
       }
-      __bang_add_const((float *)nram_a, (float *)nram_out, (float)bin_y1,
-                       c_slice_align);
-      __bang_mul_const((float *)nram_ping, (float *)nram_a, (float)width,
-                       c_slice_align);
+      __bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1,
+                        c_slice_align);
+      __bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width,
+                        c_slice_align);
       /*compute input_w*/
-      __bang_mul_const((float *)nram_a, (float *)nram_out, (float)bin_wdim,
-                       c_slice_align);
+      __bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim,
+                        c_slice_align);
       __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a,
                  c_slice_align);
-      __bang_add_const((float *)nram_a, (float *)nram_a, (float)bin_x1,
-                       c_slice_align);
+      __bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1,
+                        c_slice_align);
       __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a,
                  c_slice_align);
       convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a,
@@ -290,9 +290,7 @@ __mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type,
                       rois_num, (float)spatial_scale, (float *)output_data,
                       argmax);
     }; break;
-    default: {
-      break;
-    }
+    default: { break; }
   }
 }
 }  // namespace forward
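In the @@ -238 hunk above, the chain of add_scalar/mul_scalar/sub calls decodes the flat per-bin argmax back into an input offset; in scalar form (an illustrative helper):

// argmax is a flat offset within the (bin_hdim x bin_wdim) pooling bin;
// bin_y1/bin_x1 locate the bin, width is the input feature-map width.
int decodeArgmax(uint32_t argmax, int bin_wdim, int bin_y1, int bin_x1,
                 int width) {
  int h = int(argmax) / bin_wdim + bin_y1;  // __bang_add_scalar(..., bin_y1)
  int w = int(argmax) % bin_wdim + bin_x1;  // sub of h_part * bin_wdim, + bin_x1
  return h * width + w;                     // mul by width, then __bang_add
}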
@@ -328,30 +326,30 @@ __mlu_func__ void convertIndex(
                    align_c);
   // Perform 'temp_result - hstart' operation
-  __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
-                   align_c);
+  __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
+                    align_c);
   // Perform 'temp_result1 - temp_result2 * width' operation
-  __bang_mul_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
-                   align_c);
+  __bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
+                    align_c);
   convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
                    (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
   __bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp,
              (float *)nram_argmax_fp_w, align_c);
   // Perform 'temp_result - wstart' operation
-  __bang_sub_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, wstart,
-                   align_c);
+  __bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w,
+                    wstart, align_c);
   // Perform 'temp_result = h * w_compute + w' operation
-  __bang_mul_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
-                   w_compute, align_c);
+  __bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
+                    w_compute, align_c);
   __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
              (float *)nram_argmax_fp_w, align_c);
   if (loop_flag == 1) {
-    __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
-                     (loop_id * true_limit), align_c);
+    __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
+                      (loop_id * true_limit), align_c);
   }
   convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1,
                    (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2,
@@ -460,21 +458,22 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
       */
       // Load the data from GDRAM to NRAM.
-      __memcpy((T *)nram_grads + align_c * high_precision,
-               (const T *)grads + (n * pooled_height * pooled_width +
-                                   ph * pooled_width + pw) *
-                                      channels,
-               channels * sizeof(T), GDRAM2NRAM);
+      __memcpy(
+          (T *)nram_grads + align_c * high_precision,
+          (const T *)grads +
+              (n * pooled_height * pooled_width + ph * pooled_width + pw) *
+                  channels,
+          channels * sizeof(T), GDRAM2NRAM);
       if (high_precision) {
         __bang_half2float((float *)nram_grads,
                           (half *)nram_grads + align_c * high_precision,
                           align_c);
       }
-      __memcpy((int32_t *)nram_argmax,
-               (const int32_t *)argmax + (n * pooled_height * pooled_width +
-                                          ph * pooled_width + pw) *
-                                             channels,
-               channels * sizeof(int32_t), GDRAM2NRAM);
+      __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
                                           (n * pooled_height * pooled_width +
                                            ph * pooled_width + pw) *
                                               channels,
+               channels * sizeof(int32_t), GDRAM2NRAM);
       // Perform pooling operation on NRAM.
@@ -523,20 +522,21 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
       */
       // Load the data from GDRAM to NRAM.
-      __memcpy((T *)nram_grads + align_c * high_precision,
-               (const T *)grads + (n * pooled_height * pooled_width +
-                                   ph * pooled_width + pw) *
-                                      channels,
-               channels * sizeof(T), GDRAM2NRAM);
+      __memcpy(
+          (T *)nram_grads + align_c * high_precision,
+          (const T *)grads +
+              (n * pooled_height * pooled_width + ph * pooled_width + pw) *
+                  channels,
+          channels * sizeof(T), GDRAM2NRAM);
       if (high_precision) {
         __bang_half2float((float *)nram_grads,
                           (half *)nram_grads + align_c * high_precision,
                           align_c);
       }
-      __memcpy((int32_t *)nram_argmax,
-               (const int32_t *)argmax + (n * pooled_height * pooled_width +
-                                          ph * pooled_width + pw) *
-                                             channels,
-               channels * sizeof(int32_t), GDRAM2NRAM);
+      __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
                                           (n * pooled_height * pooled_width +
                                            ph * pooled_width + pw) *
                                               channels,
+               channels * sizeof(int32_t), GDRAM2NRAM);
       int ping_pong = 0;
@@ -713,9 +713,7 @@ __mlu_global__ void MLUKernelRoiPoolBackward(
                    height, width, pooled_height, pooled_width, rois_num,
                    (const float)spatial_scale, high_precision);
     }; break;
-    default: {
-      break;
-    }
+    default: { break; }
   }
 }
 }  // namespace backward
...
@@ -26,7 +26,7 @@ __mlu_func__ void mluMultiKernelTinShift(
     int t_shift = shifts[n_index * group_size + group_id];
     int index = cur_channel_index % channel_size * hw_size +
                 n_index * time_size * channel_size * hw_size;
-    __nramset(data_nram, MAX_NRAM_SIZE, (char)0);
+    __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
     __asm__ volatile("sync;");
     if (abs(t_shift) >= time_size) {
       __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
@@ -109,7 +109,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence(
     int next_sequence_index =
         index / hw_size / channel_size % time_size + segmentime_size;
     int cur_sequence_index = index / hw_size / channel_size % time_size;
-    __nramset(data_nram, MAX_NRAM_SIZE, (char)0);
+    __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
     __asm__ volatile("sync;");
     if (max_number_hw_per_core == 0) {
       mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index,
...
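Both tin_shift kernels zero the NRAM buffer up front because frames shifted in from outside the clip must read as zeros; schematically, per channel group with shift s (a sketch of the intended semantics, not the exact tiling):

// When |s| >= time_size every source index is out of range, which is why
// the kernels above short-circuit to copying the zeroed buffer out.
template <typename T>
void tinShiftGroup(T *out, const T *in, int s, int time_size, int hw_size) {
  for (int t = 0; t < time_size; ++t) {
    int src_t = t - s;
    bool valid = (src_t >= 0 && src_t < time_size);
    for (int i = 0; i < hw_size; ++i)
      out[t * hw_size + i] = valid ? in[src_t * hw_size + i] : T(0);
  }
}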