Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
MMCV
Commits
e847cf8a
Commit
e847cf8a
authored
Oct 10, 2022
by
bdf
Committed by
Zaida Zhou
Nov 23, 2022
Browse files
[Refactor] Adapt mlu code to cntoolkit3.0.1
parent
4c6e99c8
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
967 additions
and
1092 deletions
+967
-1092
mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu
+20
-20
mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu
+13
-13
mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
+50
-27
mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
+241
-940
mmcv/ops/csrc/common/mlu/nms_utils.hpp
mmcv/ops/csrc/common/mlu/nms_utils.hpp
+553
-0
mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu
+7
-7
mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
+39
-39
mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu
+42
-44
mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu
+2
-2
No files found.
mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu
View file @
e847cf8a
...
@@ -88,14 +88,14 @@ __mlu_func__ void bboxOverlapsWorkflow(
...
@@ -88,14 +88,14 @@ __mlu_func__ void bboxOverlapsWorkflow(
// right - left + offset ---> left
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_
const
(vec_left, vec_left, (T)offset, batches_stride);
__bang_add_
scalar
(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_
const
(vec_right, vec_right, (T)offset, batches_stride);
__bang_add_
scalar
(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
// zero vector ---> bottom
__
nramset
(vec_bottom, batches_stride, 0.f);
__
bang_write_value
(vec_bottom, batches_stride, 0.f);
// width --> vec_left
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
...
@@ -107,11 +107,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
...
@@ -107,11 +107,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b1_area
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_
const
(vec_top, vec_top, (T)offset, batches_stride);
__bang_add_
scalar
(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_
const
(vec_bottom, vec_bottom, (T)offset, batches_stride);
__bang_add_
scalar
(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
// ---> vec_top;
...
@@ -121,11 +121,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
...
@@ -121,11 +121,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b2_area
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_
const
(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
__bang_add_
scalar
(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_
const
(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
__bang_add_
scalar
(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
// ---> b2_x1;
...
@@ -137,7 +137,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
...
@@ -137,7 +137,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
T *inter_s = height;
T *inter_s = height;
// offset vector ---> vec_b2_y1
// offset vector ---> vec_b2_y1
__
nramset
(vec_b2_y1, batches_stride, T(offset));
__
bang_write_value
(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
T *vec_offset = vec_b2_y1;
if (mode == 0) {
if (mode == 0) {
...
@@ -164,10 +164,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
...
@@ -164,10 +164,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
int32_t base1 = b1 * COORD_NUM;
int32_t base1 = b1 * COORD_NUM;
// set bbox1 and bbox2 to nram
// set bbox1 and bbox2 to nram
__
nramset
(vec_b1_x1, batches_stride, bbox1[base1]);
__
bang_write_value
(vec_b1_x1, batches_stride, bbox1[base1]);
__
nramset
(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
__
bang_write_value
(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
__
nramset
(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
__
bang_write_value
(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
__
nramset
(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
__
bang_write_value
(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
for (int32_t j = 0; j < num_loop_cpy; j++) {
for (int32_t j = 0; j < num_loop_cpy; j++) {
int32_t index2 = j * batches_stride;
int32_t index2 = j * batches_stride;
...
@@ -195,13 +195,13 @@ __mlu_func__ void bboxOverlapsWorkflow(
...
@@ -195,13 +195,13 @@ __mlu_func__ void bboxOverlapsWorkflow(
// right - left + offset ---> left
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_
const
(vec_left, vec_left, (T)offset, batches_stride);
__bang_add_
scalar
(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_
const
(vec_right, vec_right, (T)offset, batches_stride);
__bang_add_
scalar
(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
// zero vector ---> bottom
__
nramset
(vec_bottom, batches_stride, (T)0);
__
bang_write_value
(vec_bottom, batches_stride, (T)0);
// width --> vec_left
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
...
@@ -213,10 +213,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
...
@@ -213,10 +213,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b1_area
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_
const
(vec_top, vec_top, (T)offset, batches_stride);
__bang_add_
scalar
(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_
const
(vec_bottom, vec_bottom, (T)offset, batches_stride);
__bang_add_
scalar
(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
// ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
...
@@ -225,10 +225,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
...
@@ -225,10 +225,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b2_area
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_
const
(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
__bang_add_
scalar
(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_
const
(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
__bang_add_
scalar
(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
// ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
...
@@ -239,7 +239,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
...
@@ -239,7 +239,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
T *inter_s = height;
T *inter_s = height;
// offset vector ---> vec_b2_y1
// offset vector ---> vec_b2_y1
__
nramset
(vec_b2_y1, batches_stride, T(offset));
__
bang_write_value
(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
T *vec_offset = vec_b2_y1;
if (mode == 0) {
if (mode == 0) {
...
...
mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu
View file @
e847cf8a
...
@@ -139,7 +139,7 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
...
@@ -139,7 +139,7 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1;
blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1;
// set output_nram to zero
// set output_nram to zero
__
nramset
(output_nram, param.output_nram_size, T(0));
__
bang_write_value
(output_nram, param.output_nram_size, T(0));
// loop blocks of kernel window: grid_dim.(Kh, Kw)
// loop blocks of kernel window: grid_dim.(Kh, Kw)
for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) {
for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) {
...
@@ -313,8 +313,8 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
...
@@ -313,8 +313,8 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
T *sum = sum_array;
T *sum = sum_array;
for (int g = 0; g < blkSize.G; ++g) {
for (int g = 0; g < blkSize.G; ++g) {
__bang_mul_
const
(sum, src, mask_array[mask_index],
__bang_mul_
scalar
(sum, src, mask_array[mask_index],
param.block_Cg_NFU);
param.block_Cg_NFU);
//
//
// NOTE: Since block_Cg_NFU >= block_Cg_stride,
// NOTE: Since block_Cg_NFU >= block_Cg_stride,
// overlapped writing may occur on sum_array.
// overlapped writing may occur on sum_array.
...
@@ -446,8 +446,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
...
@@ -446,8 +446,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
T *base_grad_input = (T *)grad_input + input_index;
T *base_grad_input = (T *)grad_input + input_index;
__memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T),
__memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T),
GDRAM2NRAM);
GDRAM2NRAM);
__bang_mul_
const
((T *)grad_input_buff, (T *)grad_output_buff,
__bang_mul_
scalar
((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], num_align);
((T *)mask_buff)[mask_index], num_align);
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
(T *)grad_input_buff, num_align);
(T *)grad_input_buff, num_align);
__bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
__bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
...
@@ -485,8 +485,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
...
@@ -485,8 +485,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
T *base_grad_input = (T *)grad_input + input_index;
T *base_grad_input = (T *)grad_input + input_index;
__memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T),
__memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T),
GDRAM2NRAM);
GDRAM2NRAM);
__bang_mul_
const
((T *)grad_input_buff, (T *)grad_output_buff,
__bang_mul_
scalar
((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], rem_for_loop_align);
((T *)mask_buff)[mask_index], rem_for_loop_align);
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
(T *)grad_input_buff, rem_for_loop);
(T *)grad_input_buff, rem_for_loop);
__bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
__bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
...
@@ -541,12 +541,12 @@ void KernelCarafeBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
...
@@ -541,12 +541,12 @@ void KernelCarafeBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
const int wi, const int c, const int k_up,
const int wi, const int c, const int k_up,
const int group, const int scale) {
const int group, const int scale) {
if (dtype == CNRT_FLOAT16) {
if (dtype == CNRT_FLOAT16) {
backward::MLUUnion1KernelCarafeBackward<half>
backward::MLUUnion1KernelCarafeBackward<half>
<<<k_dim, k_type, queue>>>(
<<<k_dim, k_type, queue>>>(
input, mask, grad_output, grad_input,
input, mask, grad_output, grad_input,
grad_mask, n, hi, wi, c, k_up,
grad_mask, n, hi, wi, c, k_up,
group, scale);
group, scale);
} else {
} else {
backward::MLUUnion1KernelCarafeBackward<float>
backward::MLUUnion1KernelCarafeBackward<float>
<<<k_dim, k_type, queue>>>(
<<<k_dim, k_type, queue>>>(
input, mask, grad_output, grad_input,
input, mask, grad_output, grad_input,
grad_mask, n, hi, wi, c, k_up,
grad_mask, n, hi, wi, c, k_up,
group, scale);
group, scale);
}
}
}
}
mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
View file @
e847cf8a
...
@@ -211,51 +211,52 @@ __mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src,
...
@@ -211,51 +211,52 @@ __mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src,
// get sign bit
// get sign bit
const
float
move_23bit
=
8388608.0
;
const
float
move_23bit
=
8388608.0
;
// 0x80000000 = 1,000000000,0000000000000000000000000000
// 0x80000000 = 1,000000000,0000000000000000000000000000
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x80000000
);
0x80000000
);
__bang_cycle_band
((
char
*
)
dst_addition
,
(
char
*
)
src
,
(
char
*
)
src_addition
,
__bang_cycle_band
((
char
*
)
dst_addition
,
(
char
*
)
src
,
(
char
*
)
src_addition
,
src_count
*
sizeof
(
float
),
NFU_ALIGN_SIZE
);
src_count
*
sizeof
(
float
),
NFU_ALIGN_SIZE
);
// get 1 or 0 from sign bit
// get 1 or 0 from sign bit
// judg is Odd
// judg is Odd
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x00000001
);
0x00000001
);
__bang_cycle_bor
((
char
*
)
dst_addition
,
(
char
*
)
dst_addition
,
__bang_cycle_bor
((
char
*
)
dst_addition
,
(
char
*
)
dst_addition
,
(
char
*
)
src_addition
,
src_count
*
sizeof
(
float
),
(
char
*
)
src_addition
,
src_count
*
sizeof
(
float
),
NFU_ALIGN_SIZE
);
NFU_ALIGN_SIZE
);
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x80000001
);
0x80000001
);
__bang_cycle_eq
(
dst_addition
,
dst_addition
,
src_addition
,
src_count
,
__bang_cycle_eq
(
dst_addition
,
dst_addition
,
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
NFU_ALIGN_SIZE
/
sizeof
(
float
));
// minus xor, positive num invariant
// minus xor, positive num invariant
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0xffffffff
);
0xffffffff
);
__bang_cycle_mul
(
dst
,
dst_addition
,
src_addition
,
src_count
,
__bang_cycle_mul
(
dst
,
dst_addition
,
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
NFU_ALIGN_SIZE
/
sizeof
(
float
));
__bang_bxor
((
char
*
)
dst
,
(
char
*
)
src
,
(
char
*
)
dst
,
src_count
*
sizeof
(
float
));
__bang_bxor
((
char
*
)
dst
,
(
char
*
)
src
,
(
char
*
)
dst
,
src_count
*
sizeof
(
float
));
// convert int32 to float32
// convert int32 to float32
__nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x7fffff
);
__bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x7fffff
);
__bang_cycle_band
((
char
*
)
dst
,
(
char
*
)
dst
,
(
char
*
)
src_addition
,
__bang_cycle_band
((
char
*
)
dst
,
(
char
*
)
dst
,
(
char
*
)
src_addition
,
src_count
*
sizeof
(
float
),
NFU_ALIGN_SIZE
);
src_count
*
sizeof
(
float
),
NFU_ALIGN_SIZE
);
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x4b000000
);
0x4b000000
);
__bang_cycle_bor
((
char
*
)
dst
,
(
char
*
)
dst
,
(
char
*
)
src_addition
,
__bang_cycle_bor
((
char
*
)
dst
,
(
char
*
)
dst
,
(
char
*
)
src_addition
,
src_count
*
sizeof
(
float
),
NFU_ALIGN_SIZE
);
src_count
*
sizeof
(
float
),
NFU_ALIGN_SIZE
);
__bang_sub_
const
(
dst
,
dst
,
move_23bit
,
src_count
);
__bang_sub_
scalar
(
dst
,
dst
,
move_23bit
,
src_count
);
// add one
// add one
__bang_add
(
dst
,
dst
,
dst_addition
,
src_count
);
__bang_add
(
dst
,
dst
,
dst_addition
,
src_count
);
// set sign for float32
// set sign for float32
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0xffffffff
);
0xffffffff
);
__bang_cycle_mul
(
dst_addition
,
dst_addition
,
src_addition
,
src_count
,
__bang_cycle_mul
(
dst_addition
,
dst_addition
,
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
NFU_ALIGN_SIZE
/
sizeof
(
float
));
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x00000001
);
0x00000001
);
__bang_cycle_add
(
dst_addition
,
dst_addition
,
src_addition
,
src_count
,
__bang_cycle_add
(
dst_addition
,
dst_addition
,
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
NFU_ALIGN_SIZE
/
sizeof
(
float
));
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x80000000
);
0x80000000
);
__bang_cycle_band
((
char
*
)
dst_addition
,
(
char
*
)
dst_addition
,
__bang_cycle_band
((
char
*
)
dst_addition
,
(
char
*
)
dst_addition
,
(
char
*
)
src_addition
,
src_count
*
4
,
128
);
(
char
*
)
src_addition
,
src_count
*
4
,
128
);
__bang_bor
((
char
*
)
dst
,
(
char
*
)
dst
,
(
char
*
)
dst_addition
,
src_count
*
4
);
__bang_bor
((
char
*
)
dst
,
(
char
*
)
dst
,
(
char
*
)
dst_addition
,
src_count
*
4
);
...
@@ -291,18 +292,20 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
...
@@ -291,18 +292,20 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
// dst_addition = abs(src)
// dst_addition = abs(src)
__bang_mul
(
dst_addition
,
src
,
(
float
*
)
dst
,
src_count
);
__bang_mul
(
dst_addition
,
src
,
(
float
*
)
dst
,
src_count
);
// if dst_addition < 1.0 , then src_addition + 1, to fix add error.
// if dst_addition < 1.0 , then src_addition + 1, to fix add error.
__nramset
((
float
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
1.0
f
);
__bang_write_value
((
float
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
1.0
f
);
__bang_cycle_lt
(
dst_addition
,
dst_addition
,
(
float
*
)
src_addition
,
src_count
,
__bang_cycle_lt
(
dst_addition
,
dst_addition
,
(
float
*
)
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
NFU_ALIGN_SIZE
/
sizeof
(
float
));
__bang_add_tz
((
float
*
)
dst
,
(
float
*
)
dst
,
(
float
*
)
dst_addition
,
src_count
);
__bang_add_tz
((
float
*
)
dst
,
(
float
*
)
dst
,
(
float
*
)
dst_addition
,
src_count
);
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0xbf800000
);
0xbf800000
);
// set negative flag -1.0 = 0xbf80000
// set negative flag -1.0 = 0xbf80000
__bang_cycle_eq
(
__bang_cycle_eq
(
(
float
*
)
dst
,
(
float
*
)
dst
,
(
float
*
)
src_addition
,
src_count
,
(
float
*
)
dst
,
(
float
*
)
dst
,
(
float
*
)
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
// to mark all src in [x<-1.0]
NFU_ALIGN_SIZE
/
sizeof
(
float
));
// to mark all src in [x<-1.0]
__bang_active_abs
(
dst_addition
,
src
,
src_count
);
__bang_active_abs
(
dst_addition
,
src
,
src_count
);
__nramset
((
float
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
8388608.0
f
);
__bang_write_value
((
float
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
8388608.0
f
);
// mask shift move 23
// mask shift move 23
__bang_cycle_add_tz
(
__bang_cycle_add_tz
(
dst_addition
,
dst_addition
,
src_addition
,
src_count
,
dst_addition
,
dst_addition
,
src_addition
,
src_count
,
...
@@ -314,12 +317,12 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
...
@@ -314,12 +317,12 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
// to fix max value
// to fix max value
// 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0,
// 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0,
// means max value.
// means max value.
__bang_mul_
const
((
float
*
)
dst
,
(
float
*
)
dst
,
16777215.0
,
src_count
);
__bang_mul_
scalar
((
float
*
)
dst
,
(
float
*
)
dst
,
16777215.0
,
src_count
);
__bang_bxor
((
char
*
)
dst_addition
,
(
char
*
)
dst_addition
,
(
char
*
)
dst
,
__bang_bxor
((
char
*
)
dst_addition
,
(
char
*
)
dst_addition
,
(
char
*
)
dst
,
src_count
*
floatDchar
);
src_count
*
floatDchar
);
// get low 23bit
// get low 23bit
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
(
unsigned
)
0x007fffff
);
(
unsigned
)
0x007fffff
);
// mask low 23bit is 1
// mask low 23bit is 1
__bang_cycle_band
((
char
*
)
dst_addition
,
(
char
*
)
dst_addition
,
__bang_cycle_band
((
char
*
)
dst_addition
,
(
char
*
)
dst_addition
,
(
char
*
)
src_addition
,
src_count
*
floatDchar
,
(
char
*
)
src_addition
,
src_count
*
floatDchar
,
...
@@ -327,16 +330,36 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
...
@@ -327,16 +330,36 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
// set 9 high bit ===> dst
// set 9 high bit ===> dst
// -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000
// -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000
// 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000
// 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000
__
nramset
(
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x3f800000
);
__
bang_write_value
(
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x3f800000
);
__bang_cycle_and
((
float
*
)
dst
,
(
float
*
)
dst
,
src_addition
,
src_count
,
__bang_cycle_and
((
float
*
)
dst
,
(
float
*
)
dst
,
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
NFU_ALIGN_SIZE
/
sizeof
(
float
));
// src or dst_addition
// src or dst_addition
__bang_bor
((
char
*
)
dst_addition
,
(
char
*
)
dst
,
(
char
*
)
dst_addition
,
__bang_bor
((
char
*
)
dst_addition
,
(
char
*
)
dst
,
(
char
*
)
dst_addition
,
src_count
*
floatDchar
);
src_count
*
floatDchar
);
__bang_mul_
const
((
float
*
)
dst
,
(
float
*
)
dst
,
-
2.0
,
src_count
);
__bang_mul_
scalar
((
float
*
)
dst
,
(
float
*
)
dst
,
-
2.0
,
src_count
);
__bang_bor
((
char
*
)
dst
,
(
char
*
)
dst
,
(
char
*
)
dst_addition
,
__bang_bor
((
char
*
)
dst
,
(
char
*
)
dst
,
(
char
*
)
dst_addition
,
src_count
*
floatDchar
);
src_count
*
floatDchar
);
#endif // __BANG_ARCH__ >= 300
#endif // __BANG_ARCH__ >= 300
}
}
/*!
* @brief Converts float32 to half data type,
* the rounding mode on MLU200 is rd, on MLU300 is rn.
*
* @param[out] dst
* Pointer to NRAM that stores half type data.
* @param[in] src
* Pointer to NRAM that stores float32 type data.
* @param[in] src_count
* The count of elements in src.
*/
__mlu_func__
inline
void
convertFloat2half
(
half
*
dst
,
float
*
src
,
int
src_count
)
{
#if __BANG_ARCH__ >= 300
__bang_float2half_rn
(
dst
,
src
,
src_count
);
#else
__bang_float2half_rd
(
dst
,
src
,
src_count
);
#endif
}
#endif // COMMON_MLU_HELPER_HPP_
#endif // COMMON_MLU_HELPER_HPP_
mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
View file @
e847cf8a
...
@@ -9,14 +9,9 @@
...
@@ -9,14 +9,9 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
*************************************************************************/
#include "
common_mlu_helper
.hpp"
#include "
nms_utils
.hpp"
#define NMS_SIZE (64)
#define COORD_DIM (4)
#define COORD_DIM (4)
#define MEMORY_CORE (0x80)
#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score
#define REDUCE_NUM \
(7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024)
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
...
@@ -24,348 +19,129 @@
...
@@ -24,348 +19,129 @@
__nram__ int8_t nram_buffer[SIZE_NRAM_BUF];
__nram__ int8_t nram_buffer[SIZE_NRAM_BUF];
__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF];
__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF];
__mlu_func__ void pvLock() {
#if __BANG_ARCH__ == 270
if (coreId != MEMORY_CORE) {
__bang_lock(0, 0);
}
#endif
}
__mlu_func__ void pvUnlock() {
#if __BANG_ARCH__ == 270
if (coreId != MEMORY_CORE) {
__bang_unlock(0, 0);
}
#endif
}
enum Addr { SRAM, GDRAM };
enum Addr { SRAM, GDRAM };
template <typename IN_DT, typename OUT_DT>
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection(
__mlu_func__ void nms_detection(
uint32_t *output_box_num, const int output_mode, const int input_layout,
uint32_t &output_box_num, const int output_mode, OUT_DT *output_dram,
OUT_DT *output_data, const Addr dst, IN_DT *input_data_score,
IN_DT *input_data_score, const IN_DT *input_data_box, const Addr input_ram,
const IN_DT *input_data_box, const Addr src, IN_DT *buffer,
IN_DT *sram, const int core_limit, const int input_num_boxes,
const int buffer_size, IN_DT *sram, const int core_limit,
const int max_output_size, const float thresh_iou, const float thresh_score,
const int input_box_num, const int input_stride, const int output_stride,
const int keepNum, const float thresh_iou, const float thresh_score,
const float offset, const int algo) {
const float offset, const int algo) {
// global value, it is stored in sram with a offset from the begin.
// global value
const int flag_offset_size = 28;
int32_t *exit_flag = (int32_t *)(sram + 28);
int32_t *loop_end_flag = (int32_t *)(sram + flag_offset_size);
exit_flag[0] = 0;
loop_end_flag[0] = 0;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
const
int nms_buffer_count1 = 9;
int nms_buffer_count1 = 9;
// temp nram buffer to store selected target.
// temp nram buffer to store selected target.
const
int nram_save_limit_count = 256;
int nram_save_limit_count = 256;
float div_thresh_iou = 1.0 / thresh_iou;
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr
// input data ptr
IN_DT *input_score_ptr;
const IN_DT *input_x1_ptr = input_data_box;
const IN_DT *input_x1_ptr;
const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
const IN_DT *input_y1_ptr;
const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
const IN_DT *input_x2_ptr;
const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;
const IN_DT *input_y2_ptr;
input_score_ptr = input_data_score;
input_x1_ptr = input_data_box;
if (input_layout == 0) {
// [boxes_num, 4]
input_y1_ptr = input_x1_ptr + 1;
input_x2_ptr = input_x1_ptr + 2;
input_y2_ptr = input_x1_ptr + 3;
} else if (input_layout == 1) {
// [4, boxes_num]
input_y1_ptr = input_x1_ptr + input_stride;
input_x2_ptr = input_y1_ptr + input_stride;
input_y2_ptr = input_x2_ptr + input_stride;
}
// nram data ptr
IN_DT *x1;
IN_DT *y1;
IN_DT *x2;
IN_DT *y2;
IN_DT *score;
IN_DT *inter_x1;
IN_DT *inter_y1;
IN_DT *inter_x2;
IN_DT *inter_y2;
IN_DT *max_box; // the max score, x1, y1, x2, y2
IN_DT *x1_mask;
IN_DT *y1_mask;
IN_DT *x2_mask;
IN_DT *y2_mask;
OUT_DT *nram_save;
int limit = 0; // find limit when GDRAM or SRAM
int limit = 0; // find limit when GDRAM or SRAM
int len_core = 0; // the length deal by every core
int max_seg_pad = 0; // the max length every repeat
int max_seg_pad = 0; // the max length every repeat
int repeat = 0;
int repeat = 0;
int remain = 0;
int remain = 0;
int remain_pad = 0;
int remain_pad = 0;
int input_offset = 0; // offset of input_data for current core
int input_offset = 0; // offset of input_data for current core
int nram_save_count = 0;
int nram_save_count = 0;
// mask for collect x1, y1, x2, y2. each mask has 128 elements
const int mask_size = 128;
const int total_mask_size = 512;
if (output_mode == 0) {
if (output_mode == 0) {
limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) -
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT) -
nram_save_limit_count * sizeof(OUT_DT)) /
total_mask_size * sizeof(IN_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
(nms_buffer_count1 * sizeof(IN_DT));
} else {
} else {
limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) -
// 5 maens: score, x1, y1, x2, y2
nram_save_limit_count * INFO_NUM
* sizeof(
OUT
_DT) -
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/
* sizeof(
IN
_DT) -
total_mask_size
* sizeof(
IN
_DT)) /
nram_save_limit_count * 5
* sizeof(
OUT
_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
(nms_buffer_count1 * sizeof(IN_DT));
}
}
if (core_limit == 1) {
int max_seg_iou_compute = 0;
len_core = input_box_num;
int repeat_iou_compute = 0;
input_offset = 0;
int remain_iou_compute = 0;
} else {
int remain_pad_iou_compute = 0;
int avg_core = input_box_num / core_limit;
int rem = input_box_num % core_limit;
len_core = avg_core + (taskId < rem ? 1 : 0);
input_offset = avg_core * taskId + (taskId <= rem ? taskId : rem);
}
max_seg_pad = PAD_DOWN(limit, NMS_SIZE);
repeat = len_core / max_seg_pad;
remain = len_core % max_seg_pad;
remain_pad = PAD_UP(remain, NMS_SIZE);
// if datatype is half, we should convert it to float when compute the IoU
getComputeParamsBlockOrU1(sizeof(IN_DT), input_num_boxes, limit, core_limit,
int max_seg_iou_compute =
input_offset, max_seg_pad, repeat, remain,
PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE);
remain_pad, max_seg_iou_compute, repeat_iou_compute,
int repeat_iou_compute = len_core / max_seg_iou_compute;
remain_iou_compute, remain_pad_iou_compute);
int remain_iou_compute = len_core % max_seg_iou_compute;
int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE);
// initial the address point
score = buffer;
x1 = score + max_seg_pad;
y1 = x1 + max_seg_pad;
x2 = y1 + max_seg_pad;
y2 = x2 + max_seg_pad;
inter_x1 = y2 + max_seg_pad;
inter_y1 = inter_x1 + max_seg_pad;
inter_x2 = inter_y1 + max_seg_pad;
inter_y2 = inter_x2 + max_seg_pad;
x1_mask = inter_y2 + max_seg_pad;
y1_mask = x1_mask + mask_size;
x2_mask = y1_mask + mask_size;
y2_mask = x2_mask + mask_size;
max_box = y2_mask + mask_size; // the max score, x1, y1, x2, y2
// offset two line from max_box
nram_save = (OUT_DT *)((char *)max_box + NFU_ALIGN_SIZE);
// set mask for __bang_collect instruction
// init the data ptr
if (input_layout == 0) {
IN_DT *score = (IN_DT *)nram_buffer;
__nramset((IN_DT *)x1_mask, total_mask_size, (IN_DT)0);
IN_DT *x1 = score + max_seg_pad;
for (int idx = 0; idx < mask_size; idx++) {
IN_DT *y1 = x1 + max_seg_pad;
int index = (idx % COORD_DIM) * mask_size + idx;
IN_DT *x2 = y1 + max_seg_pad;
x1_mask[index] = (IN_DT)1.0;
IN_DT *y2 = x2 + max_seg_pad;
}
IN_DT *inter_x1 = y2 + max_seg_pad;
}
IN_DT *inter_y1 = inter_x1 + max_seg_pad;
IN_DT *inter_x2 = inter_y1 + max_seg_pad;
IN_DT *inter_y2 = inter_x2 + max_seg_pad;
IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
NFU_ALIGN_SIZE); // offset two line from max_box
for (int keep = 0; keep < keepNum; keep++) { // loop until the max_score <= 0
#if __BANG_ARCH__ >= 300
float max_box_x1 = 0;
float max_box_y1 = 0;
float max_box_x2 = 0;
float max_box_y2 = 0;
#endif
mluMemcpyDirection_t load_dir = SRAM2NRAM;
mluMemcpyDirection_t store_dir = NRAM2SRAM;
load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= 0
if (core_limit != 1) {
if (core_limit != 1) {
__sync_cluster(); // sync before current loop
__sync_cluster(); // sync before current loop
}
}
/******
find max start
******/
/******
FIND MAX START
******/
int max_index = 0; // the max score index
int max_index = 0; // the max score index
int global_max_index = 0; // for U1
int global_max_index = 0; // for U1
float max_area = 0; // the max s
c
ore area
float max_area = 0; // the max so
c
re area
max_box[0] = 0; // init 0
max_box[0] = 0; // init 0
findCoreMaxBox(input_data_score, score, inter_x1, max_box, input_x1_ptr,
for (int i = 0; i <= repeat; i++) {
input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
if (i == repeat && remain == 0) {
input_offset, repeat, remain, remain_pad, max_seg_pad,
break;
max_index);
}
int seg_len = 0; // the length every nms compute
int cpy_len = 0; // the length every nms memcpy
i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad;
// check seg_len exceeds the limit of fp16 or not. 65536 is the largest
// num that half data type could express.
if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) {
// seg length exceeds the max num for fp16 datatype!
return;
}
i == repeat ? cpy_len = remain : cpy_len = max_seg_pad;
/******nms load start******/
mluMemcpyDirection_t load_dir = SRAM2NRAM;
if (src == SRAM) {
load_dir = SRAM2NRAM;
} else {
load_dir = GDRAM2NRAM;
}
__nramset(score, seg_len, (IN_DT)0);
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
/******nms load end******/
__bang_max(inter_x1, score, seg_len);
if (inter_x1[0] > max_box[0]) {
max_box[0] = inter_x1[0];
if (sizeof(IN_DT) == sizeof(half)) {
max_index = ((uint16_t *)inter_x1)[1] + input_offset +
i * max_seg_pad; // offset start from head of input_data
} else if (sizeof(IN_DT) == sizeof(float)) {
max_index = ((uint32_t *)inter_x1)[1] + input_offset +
i * max_seg_pad; // offset start from head of input_data
}
}
} // for repeat
int stride = 1;
if (input_layout == 0) {
stride = input_stride;
} else if (input_layout == 1) {
stride = 1;
}
if (core_limit == 1) {
if (core_limit == 1) {
max_box[1] = input_x1_ptr[max_index * stride];
#if __BANG_ARCH__ >= 300
max_box[2] = input_y1_ptr[max_index * stride];
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box[3] = input_x2_ptr[max_index * stride];
max_box_x2, max_box_y2);
max_box[4] = input_y2_ptr[max_index * stride];
#else
if (algo == 0 || offset == 0.0) {
calMaxArea(max_box, algo, offset, max_area);
max_area = ((float)max_box[3] - (float)max_box[1]) *
#endif
((float)max_box[4] - (float)max_box[2]);
input_data_score[max_index] = 0;
} else {
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
input_score_ptr[max_index] = 0;
global_max_index = max_index;
global_max_index = max_index;
((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
} else if (core_limit == 4) {
} else if (core_limit == 4) {
// find the max with sram
// the max box's x1, y1, x2, y2 on every core
if (coreId != MEMORY_CORE) {
max_box[1] = input_x1_ptr[max_index * stride];
max_box[2] = input_y1_ptr[max_index * stride];
max_box[3] = input_x2_ptr[max_index * stride];
max_box[4] = input_y2_ptr[max_index * stride];
}
((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
// copy every core's box info to sram, form: score---x1---y1---x2---y2---
for (int i = 0; i < INFO_NUM; i++) {
__memcpy(sram + i * core_limit + taskId, max_box + i, 1 * sizeof(IN_DT),
NRAM2SRAM);
}
// copy every core's max_index to sram, use 2 half to store max_index
__memcpy(sram + INFO_NUM * core_limit + taskId * 2, max_box + INFO_NUM,
sizeof(uint32_t),
NRAM2SRAM); // int32_t datatype
__sync_cluster();
__sync_cluster();
findClusterMaxBox(sram, max_box, inter_x1, input_data_score, core_limit);
// copy score from sram to nram and find the max
#if __BANG_ARCH__ >= 300
__nramset(inter_x1, NMS_SIZE, (IN_DT)0);
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
__memcpy(inter_x1, sram, core_limit * sizeof(IN_DT), SRAM2NRAM);
max_box_x2, max_box_y2);
__bang_max(max_box, inter_x1, NMS_SIZE);
#else
int max_core = 0;
calMaxArea(max_box, algo, offset, max_area);
if (sizeof(IN_DT) == sizeof(half)) {
#endif
max_core = ((uint16_t *)max_box)[1];
global_max_index = ((uint32_t *)(max_box + 5))[0];
} else if (sizeof(IN_DT) == sizeof(float)) {
input_data_score[global_max_index] = 0;
max_core = ((uint32_t *)max_box)[1];
}
// copy the max box from SRAM to NRAM
__memcpy(max_box + 1, sram + 1 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // x1
__memcpy(max_box + 2, sram + 2 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // y1
__memcpy(max_box + 3, sram + 3 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // x2
__memcpy(max_box + 4, sram + 4 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // y2
__memcpy(max_box + 5, sram + 5 * core_limit + 2 * max_core,
sizeof(uint32_t), SRAM2NRAM);
if (algo == 0 || offset == 0.0) {
max_area = ((float)max_box[3] - (float)max_box[1]) *
((float)max_box[4] - (float)max_box[2]);
} else {
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
global_max_index = ((uint32_t *)(max_box + INFO_NUM))[0];
input_score_ptr[global_max_index] = 0;
}
}
// by now, we get: max_score|max_index|max_box|max_area
// by now, we get: max_score|max_index|max_box|max_area
/******find max end******/
/******FIND MAX END******/
/******nms store start******/
// store to nram
if (float(max_box[0]) > thresh_score) {
OUT_DT *save_ptr;
int save_offset = 0;
int save_str_num = 0;
save_ptr = nram_save;
save_offset = nram_save_count;
save_str_num = nram_save_limit_count;
if (coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
__memcpy(save_ptr + save_offset, (uint32_t *)(max_box + INFO_NUM),
1 * sizeof(uint32_t), NRAM2NRAM, 1 * sizeof(uint32_t),
1 * sizeof(uint32_t), 0);
} else if (output_mode == 1) { // score, x1, y1, x2, y2
__memcpy(save_ptr + save_offset * INFO_NUM, max_box,
INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
} else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2---
__memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT),
NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT),
4);
}
}
nram_save_count++;
(*output_box_num)++;
}
// store to sram/gdram
storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
if (*output_box_num != 0) {
max_output_size, thresh_score, output_mode, nram_save_count,
mluMemcpyDirection_t store_dir = NRAM2GDRAM;
output_box_num);
if (dst == SRAM) {
store_dir = NRAM2SRAM;
} else { // dst == GDRAM
store_dir = NRAM2GDRAM;
}
if ((nram_save_count == nram_save_limit_count) ||
(float(max_box[0]) <= thresh_score) || keep == keepNum - 1) {
if (nram_save_count != 0) {
if (coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
pvLock();
__memcpy(output_data, nram_save,
nram_save_count * sizeof(uint32_t), store_dir);
pvUnlock();
output_data += nram_save_count;
} else if (output_mode == 1) { // score, x1, y1, x2, y2
pvLock();
__memcpy(output_data, nram_save,
nram_save_count * INFO_NUM * sizeof(IN_DT), store_dir);
pvUnlock();
output_data += nram_save_count * INFO_NUM;
} else if (output_mode ==
2) { // score---, x1---, y1---, x2---, y2---
pvLock();
__memcpy(output_data, nram_save, nram_save_count * sizeof(IN_DT),
store_dir, output_stride * sizeof(IN_DT),
nram_save_limit_count * sizeof(IN_DT), 4);
pvUnlock();
output_data += nram_save_count;
}
nram_save_count = 0;
}
}
} // if move data nram->sram/gdram
} // if dst
// if the max score <= 0, end
// if the max score <= 0, end
if (core_limit == 1) {
if (core_limit == 1) {
...
@@ -375,190 +151,40 @@ __mlu_func__ void nms_detection(
...
@@ -375,190 +151,40 @@ __mlu_func__ void nms_detection(
} else {
} else {
if (float(max_box[0]) <= thresh_score) {
if (float(max_box[0]) <= thresh_score) {
if (coreId == 0) {
if (coreId == 0) {
loop_end
_flag[0] = 1;
exit
_flag[0] = 1;
}
}
}
}
__sync_cluster();
__sync_cluster();
if (
loop_end
_flag[0] == 1) {
if (
exit
_flag[0] == 1) {
break;
break;
}
}
}
}
/******nms store end******/
/******NMS STORE END******/
#if __BANG_ARCH__ >= 300
// To solve half data accuracy, we convert half to float to calculate IoU.
scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
for (int i = 0; i <= repeat_iou_compute; i++) {
input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
if (i == repeat_iou_compute && remain_iou_compute == 0) {
inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box_x1,
break;
max_box_y1, max_box_x2, max_box_y2, nram_save,
}
repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
int seg_len = 0; // the length every nms compute
max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
int cpy_len = 0; // the length every nms memcpy
input_offset, offset, max_area, input_num_boxes, algo);
i == repeat_iou_compute ? seg_len = remain_pad_iou_compute
#else
: seg_len = max_seg_iou_compute;
scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
i == repeat_iou_compute ? cpy_len = remain_iou_compute
input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
: cpy_len = max_seg_iou_compute;
inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box[1],
max_box[2], max_box[3], max_box[4], nram_save,
/******nms load start******/
repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
mluMemcpyDirection_t load_dir = SRAM2NRAM;
max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
if (src == SRAM) {
input_offset, offset, max_area, input_num_boxes, algo);
load_dir = SRAM2NRAM;
#endif
} else {
} // for max_output_size
load_dir = GDRAM2NRAM;
}
__nramset((float *)score, seg_len, 0.0f);
int dt_offset = 0;
if (sizeof(IN_DT) == sizeof(float)) {
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
dt_offset = 0;
} else if (sizeof(IN_DT) == sizeof(half)) {
__nramset(x1, seg_len, half(0));
__memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__bang_half2float((float *)score, (half *)x1, seg_len);
dt_offset = max_seg_iou_compute;
}
if (input_layout == 0) {
// the following number 4 means x1, y1, x2, y2
__memcpy(
inter_x1,
input_x1_ptr + (input_offset + i * max_seg_iou_compute) * COORD_DIM,
cpy_len * COORD_DIM * sizeof(IN_DT), load_dir,
cpy_len * COORD_DIM * sizeof(IN_DT),
cpy_len * COORD_DIM * sizeof(IN_DT), 0);
// here use collect instruction to transpose the [n, 4] shape into [4,
// n] shape to avoid
// discrete memory accessing.
for (int c_i = 0; c_i < COORD_DIM * seg_len / mask_size; c_i++) {
// the following number 32 means 32 elements will be selected out by
// once operation
__bang_collect(x1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
x1_mask, mask_size);
__bang_collect(y1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
y1_mask, mask_size);
__bang_collect(x2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
x2_mask, mask_size);
__bang_collect(y2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
y2_mask, mask_size);
}
} else if (input_layout == 1) {
__memcpy(x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__memcpy(y1 + dt_offset,
input_y1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__memcpy(x2 + dt_offset,
input_x2_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__memcpy(y2 + dt_offset,
input_y2_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
}
/******nms load end******/
/******nms compute start******/
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
seg_len);
}
// 1、 compute IOU
// get the area_I
__nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1
__bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
seg_len); // inter_x1
__nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
__bang_active_relu((float *)inter_x1, (float *)inter_x1,
seg_len); // inter_w
__nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1
__bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
seg_len); // inter_y1
__nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2
__bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
__bang_active_relu((float *)inter_y1, (float *)inter_y1,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
__bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len);
}
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
// get the area_U: area + max_area - area_I
__bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area),
seg_len);
__bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
seg_len); // area_U
// 2、 select the box
// if IOU greater than thres, set the score to zero, abort it: area_U >
// area_I * (1 / thresh)?
if (thresh_iou > 0.0) {
__bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
__bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******nms compute end******/
// update the score
mluMemcpyDirection_t update_dir = NRAM2SRAM;
if (dst == SRAM) {
update_dir = NRAM2SRAM;
} else {
update_dir = NRAM2GDRAM;
}
if (sizeof(IN_DT) == sizeof(half)) {
__bang_float2half_rd((half *)score, (float *)score, seg_len);
}
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
cpy_len * sizeof(IN_DT), update_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
pvUnlock();
} // for repeat
} // for keepNum
}
}
__mlu_global__ void MLUUnion1KernelNMS(
__mlu_global__ void MLUUnion1KernelNMS(
const void *input_boxes, const void *input_confidence,
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int input_stride,
const int input_num_boxes, const int max_output_size,
const int max_output_size, const float iou_threshold,
const float iou_threshold, const float confidence_threshold,
const float confidence_threshold, const int mode, const int input_layout,
const int output_mode, void *workspace, void *result_num, void *output,
void *workspace, void *result_num, void *output,
const cnrtDataType_t data_type_input, const float offset, const int algo) {
const cnrtDataType_t data_type_input, const float offset, const int algo) {
if (data_type_input == CNRT_FLOAT16) {
if (data_type_input == CNRT_FLOAT16) {
__memcpy(workspace, input_confidence, input_num_boxes * sizeof(half),
__memcpy(workspace, input_confidence, input_num_boxes * sizeof(half),
...
@@ -569,82 +195,48 @@ __mlu_global__ void MLUUnion1KernelNMS(
...
@@ -569,82 +195,48 @@ __mlu_global__ void MLUUnion1KernelNMS(
} else {
} else {
}
}
int output_stride = max_output_size;
uint32_t output_box_num = 0;
uint32_t result_box_num = 0;
float *score_data = (float *)workspace;
if (mode == 0) {
float *boxes_data = (float *)input_boxes;
uint32_t *out_data = (uint32_t *)output;
float *sram = (float *)sram_buffer;
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *boxes_data = (half *)input_boxes;
half *confi_data = (half *)workspace;
half *buffer = (half *)nram_buffer;
half *sram = (half *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
case CNRT_FLOAT32: {
float *boxes_data = (float *)input_boxes;
float *confi_data = (float *)workspace;
float *buffer = (float *)nram_buffer;
float *sram = (float *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
if (output_mode == 0) {
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
if (data_type_input == CNRT_FLOAT32) {
sram, taskDim, input_num_boxes, input_stride,
nms_detection(output_box_num, output_mode, (uint32_t *)output, score_data,
output_stride, max_output_size, iou_threshold,
boxes_data, GDRAM, sram, taskDim, input_num_boxes,
confidence_threshold, offset, algo);
max_output_size, iou_threshold, confidence_threshold,
((uint32_t *)result_num)[0] = result_box_num;
offset, algo);
}; break;
} else {
nms_detection(output_box_num, output_mode, (uint32_t *)output,
(half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
taskDim, input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
}
}
} else {
} else {
switch (data_type_input) {
if (data_type_input == CNRT_FLOAT32) {
default: { return; }
nms_detection(output_box_num, output_mode, (float *)output, score_data,
case CNRT_FLOAT16: {
boxes_data, GDRAM, sram, taskDim, input_num_boxes,
half *boxes_data = (half *)input_boxes;
max_output_size, iou_threshold, confidence_threshold,
half *confi_data = (half *)workspace;
offset, algo);
half *out_data = (half *)output;
} else {
half *buffer = (half *)nram_buffer;
nms_detection(output_box_num, output_mode, (half *)output,
half *sram = (half *)sram_buffer;
(half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
taskDim, input_num_boxes, max_output_size, iou_threshold,
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
confidence_threshold, offset, algo);
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
case CNRT_FLOAT32: {
float *boxes_data = (float *)input_boxes;
float *confi_data = (float *)workspace;
float *out_data = (float *)output;
float *buffer = (float *)nram_buffer;
float *sram = (float *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
}
}
}
}
((uint32_t *)result_num)[0] = output_box_num;
}
}
template <typename IN_DT, typename OUT_DT>
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection_ux(
__mlu_func__ void nms_detection_ux(
int32_t *
loop_end
_flag, uint32_t &output_box_num, OUT_DT *output_dram,
int32_t *
exit
_flag, uint32_t &output_box_num, OUT_DT *output_dram,
IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram,
IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram,
const int input_layout, const int input_num_boxes, const int input_stride,
const int input_num_boxes, const int max_output_size,
const int max_output_size, const float thresh_iou, const float thresh_score,
const float thresh_iou, const float thresh_score, const float offset,
const float offset, const int output_mode, const int algo) {
const int output_mode, const int algo) {
loop_end_flag[0] = 0;
exit_flag[0] = 0;
IN_DT *sram = (IN_DT *)sram_buffer;
IN_DT *sram = (IN_DT *)sram_buffer;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
...
@@ -654,16 +246,10 @@ __mlu_func__ void nms_detection_ux(
...
@@ -654,16 +246,10 @@ __mlu_func__ void nms_detection_ux(
float div_thresh_iou = 1.0 / thresh_iou;
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr
// input data ptr
IN_DT *input_score_ptr;
const IN_DT *input_x1_ptr = boxes_data;
const IN_DT *input_x1_ptr;
const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
const IN_DT *input_y1_ptr;
const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
const IN_DT *input_x2_ptr;
const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;
const IN_DT *input_y2_ptr;
input_score_ptr = score_data;
input_x1_ptr = boxes_data;
input_y1_ptr = input_x1_ptr + input_stride;
input_x2_ptr = input_y1_ptr + input_stride;
input_y2_ptr = input_x2_ptr + input_stride;
int limit = 0; // find limit when GDRAM or SRAM
int limit = 0; // find limit when GDRAM or SRAM
int max_seg_pad = 0; // the max length every repeat
int max_seg_pad = 0; // the max length every repeat
...
@@ -682,41 +268,16 @@ __mlu_func__ void nms_detection_ux(
...
@@ -682,41 +268,16 @@ __mlu_func__ void nms_detection_ux(
(nms_buffer_count1 * sizeof(IN_DT));
(nms_buffer_count1 * sizeof(IN_DT));
}
}
// data split
int input_offset = 0;
int avg_cluster = input_num_boxes / clusterDim;
int max_seg_iou_compute = 0;
int rem_cluster = input_num_boxes % clusterDim;
int repeat_iou_compute = 0;
int len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0);
int remain_iou_compute = 0;
int cluster_offset = avg_cluster * clusterId +
int remain_pad_iou_compute = 0;
(clusterId <= rem_cluster ? clusterId : rem_cluster);
int avg_core = len_cluster / coreDim;
int rem_core = len_cluster % coreDim;
int len_core = avg_core + (coreId < rem_core ? 1 : 0);
int core_offset =
avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
int input_offset = cluster_offset + core_offset;
max_seg_pad = PAD_DOWN(limit, NMS_SIZE);
// core 0 of each cluster calculate the max score index
int max_index_avg_core = input_num_boxes / clusterDim;
int max_index_rem_core = input_num_boxes % clusterDim;
int max_index_len_core =
max_index_avg_core + (clusterId < max_index_rem_core ? 1 : 0);
int max_index_input_offset =
max_index_avg_core * clusterId +
(clusterId <= max_index_rem_core ? clusterId : max_index_rem_core);
repeat = max_index_len_core / max_seg_pad;
remain = max_index_len_core % max_seg_pad;
remain_pad = PAD_UP(remain, NMS_SIZE);
// if datatype is fp16, we should cvt to fp32 when compute iou
int max_seg_iou_compute =
PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE);
int repeat_iou_compute = len_core / max_seg_iou_compute;
int remain_iou_compute = len_core % max_seg_iou_compute;
int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE);
getComputeParamsUx(sizeof(IN_DT), input_num_boxes, limit, input_offset,
max_seg_pad, repeat, remain, remain_pad,
max_seg_iou_compute, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute);
// init the nram ptr
// init the nram ptr
IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *x1 = score + max_seg_pad;
IN_DT *x1 = score + max_seg_pad;
...
@@ -731,320 +292,94 @@ __mlu_func__ void nms_detection_ux(
...
@@ -731,320 +292,94 @@ __mlu_func__ void nms_detection_ux(
OUT_DT *nram_save =
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
(OUT_DT *)((char *)max_box +
NFU_ALIGN_SIZE); // offset two line from max_box
NFU_ALIGN_SIZE); // offset two line from max_box
#if __BANG_ARCH__ >= 300
mluMemcpyDirection_t input_load_dir = SRAM2NRAM;
float max_box_x1 = 0;
mluMemcpyDirection_t input_store_dir = NRAM2SRAM;
float max_box_y1 = 0;
input_load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
float max_box_x2 = 0;
input_store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
float max_box_y2 = 0;
#endif
mluMemcpyDirection_t load_dir = SRAM2NRAM;
mluMemcpyDirection_t store_dir = NRAM2SRAM;
load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= 0
keep++) { // loop until the max_score <= 0
__sync_all();
__sync_all();
/******FIND MAX START******/
int max_index = 0;
int max_index = 0;
int global_max_index = 0; // for Ux
int global_max_index = 0; // for Ux
float max_area = 0; // the max socre area
float max_area = 0; // the max socre area
max_box[0] = 0; // init 0
max_box[0] = 0; // init 0
if (coreId == 0) {
if (coreId == 0) {
for (int i = 0; i <= repeat; i++) {
findCoreMaxBox(score_data, score, inter_x1, max_box, input_x1_ptr,
if (i == repeat && remain == 0) {
input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
break;
input_offset, repeat, remain, remain_pad, max_seg_pad,
}
max_index);
int seg_len = (i == repeat)
? remain_pad
: max_seg_pad; // the length every nms compute
// check seg_len exceeds the limit of fp16 or not. 65536 is the largest
// num
// that fp16 could express.
if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) {
return;
}
int cpy_len = (i == repeat)
? remain
: max_seg_pad; // the length every nms memcpy
/******NMS LOAD START******/
__bang_write_zero(score, seg_len);
__memcpy(score,
input_score_ptr + max_index_input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
/******NMS LOAD END******/
__bang_max(inter_x1, score, seg_len);
if (inter_x1[0] > max_box[0]) {
max_box[0] = inter_x1[0];
if (sizeof(IN_DT) == sizeof(half)) {
max_index =
((uint16_t *)inter_x1)[1] + max_index_input_offset +
i * max_seg_pad; // offset start from head of input_data
} else if (sizeof(IN_DT) == sizeof(float)) {
max_index =
((uint32_t *)inter_x1)[1] + max_index_input_offset +
i * max_seg_pad; // offset start from head of input_data
}
}
} // for repeat
// the max box's x1, y1, x2, y2 on every cluster
max_box[1] = input_x1_ptr[max_index];
max_box[2] = input_y1_ptr[max_index];
max_box[3] = input_x2_ptr[max_index];
max_box[4] = input_y2_ptr[max_index];
((uint32_t *)(max_box + 5))[0] = max_index;
// copy max box info to sram
// copy max box info to sram
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
}
__sync_all();
__sync_all();
// copy all partial max to the sram of cluster 0
#if __BANG_ARCH__ <= 372
if (clusterId != 0) {
findGlobalMaxBox(max_box, sram, inter_x1);
__memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT),
#endif
SRAM2SRAM, 0);
}
__sync_all();
// reduce between clusters to get the global max box
if (clusterId == 0) {
if (coreId == 0) {
__bang_write_zero(inter_x1, NMS_SIZE);
__memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
__bang_max(max_box, inter_x1, NMS_SIZE);
int max_cluster = (sizeof(IN_DT) == sizeof(half))
? ((uint16_t *)max_box)[1]
: ((uint32_t *)max_box)[1];
__memcpy(max_box, sram + max_cluster * REDUCE_NUM,
REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_cluster();
if (coreId == 0x80 && clusterDim > 1) {
// broadcast global max box to each cluster's sram
for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
__memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
cluster_idx);
}
}
__sync_cluster();
}
__sync_all();
// copy the global max box to max_box
#if __BANG_ARCH__ >= 300
__memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
if (algo == 0 || offset == 0.0) {
max_box_x2, max_box_y2);
max_area = ((float)max_box[3] - (float)max_box[1]) *
#else
((float)max_box[4] - (float)max_box[2]);
calMaxArea(max_box, algo, offset, max_area);
} else {
#endif
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
global_max_index = ((uint32_t *)(max_box + 5))[0];
global_max_index = ((uint32_t *)(max_box + 5))[0];
if (coreId !=
0x80
) {
if (coreId !=
MEMORY_CORE
) {
input_
score_
ptr
[global_max_index] = 0;
score_
data
[global_max_index] = 0;
}
}
// by now, we get: max_score|max_index|max_box|max_area
/******FIND MAX END******/
/******NMS STORE START******/
storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
// store to nram
max_output_size, thresh_score, output_mode, nram_save_count,
if (float(max_box[0]) > thresh_score) {
output_box_num);
OUT_DT *save_ptr;
int save_offset = 0;
int save_str_num = 0;
save_ptr = nram_save;
save_offset = nram_save_count;
save_str_num = nram_save_limit_count;
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0];
} else if (output_mode == 1) { // score, x1, y1, x2, y2
__memcpy(save_ptr + save_offset * INFO_NUM, max_box,
INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
} else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2---
__memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT),
NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT),
4);
}
}
nram_save_count++;
output_box_num++;
}
// store to sram/gdram
if (output_box_num != 0) {
if ((nram_save_count == nram_save_limit_count) ||
(float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) {
if (nram_save_count != 0) {
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
pvLock();
__memcpy(output_dram, nram_save,
nram_save_count * sizeof(uint32_t), NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count;
} else if (output_mode == 1) { // score, x1, y1, x2, y2
pvLock();
__memcpy(output_dram, nram_save,
nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count * INFO_NUM;
} else if (output_mode ==
2) { // score---, x1---, y1---, x2---, y2---
pvLock();
__memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
NRAM2GDRAM, max_output_size * sizeof(IN_DT),
nram_save_limit_count * sizeof(IN_DT), 4);
pvUnlock();
output_dram += nram_save_count;
}
nram_save_count = 0;
}
}
} // if move data nram->sram/gdram
} // if dst
if (float(max_box[0]) <= thresh_score) {
if (float(max_box[0]) <= thresh_score) {
if (clusterId == 0 && coreId == 0) {
if (clusterId == 0 && coreId == 0) {
loop_end
_flag[0] = 1; // dram
exit
_flag[0] = 1; // dram
}
}
}
}
__sync_all();
__sync_all();
if (
loop_end
_flag[0] == 1) {
if (
exit
_flag[0] == 1) {
break;
break;
}
}
/******NMS STORE END******/
/******NMS STORE END******/
#if __BANG_ARCH__ >= 300
// To solve fp16 accuracy, we convert fp16 to fp32 to calculate IoU.
scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
for (int i = 0; i <= repeat_iou_compute; i++) {
input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
if (i == repeat_iou_compute && remain_iou_compute == 0) {
inter_y1, inter_x2, inter_y2, max_box, max_box_x1, max_box_y1,
break;
max_box_x2, max_box_y2, nram_save, repeat_iou_compute,
}
remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute
max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
: max_seg_iou_compute;
max_area, input_num_boxes, algo);
int cpy_len =
#else
(i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute;
scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
/******NMS LOAD START******/
inter_y1, inter_x2, inter_y2, max_box, max_box[1], max_box[2],
__nramset((float *)score, seg_len, 0.0f);
max_box[3], max_box[4], nram_save, repeat_iou_compute,
int dt_offset = 0;
remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
if (sizeof(IN_DT) == sizeof(float)) {
max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
max_area, input_num_boxes, algo);
cpy_len * sizeof(IN_DT), input_load_dir,
#endif
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
} // for max_output_size
dt_offset = 0;
} else if (sizeof(IN_DT) == sizeof(half)) {
__nramset(x1, seg_len, half(0));
__memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
__bang_half2float((float *)score, (half *)x1, seg_len);
dt_offset = max_seg_iou_compute;
}
__memcpy(x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), input_load_dir,
max_seg_pad * sizeof(IN_DT), input_num_boxes * sizeof(IN_DT), 3);
/******NMS LOAD END******/
/******NMS COMPUTE START******/
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
seg_len);
}
// 1、 compute IOU
// get the area_I
__nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1
__bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
seg_len); // inter_x1
__nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
__bang_active_relu((float *)inter_x1, (float *)inter_x1,
seg_len); // inter_w
__nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1
__bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
seg_len); // inter_y1
__nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2
__bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
__bang_active_relu((float *)inter_y1, (float *)inter_y1,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
__bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len);
}
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
// get the area_U: area + max_area - area_I
__bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area),
seg_len);
__bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
seg_len); // area_U
// 2、 select the box
// if IOU greater than thres, set the score to zero, abort it: area_U >
// area_I * (1 / thresh)?
if (thresh_iou > 0.0) {
__bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
__bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******NMS COMPUTE END******/
if (sizeof(IN_DT) == 2) {
__bang_float2half_rd((half *)score, (float *)score, seg_len);
}
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
cpy_len * sizeof(IN_DT), input_store_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
pvUnlock();
} // for repeat
} // for max_output_size
}
}
__mlu_global__ void MLUUionXKernelNMS(
__mlu_global__ void MLUUionXKernelNMS(
const void *input_boxes, const void *input_confidence,
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int
input_layout, const int in
put_s
trid
e,
const int input_num_boxes, const int
max_out
put_s
iz
e,
const
int max_output_size
, const float
iou
_threshold,
const
float iou_threshold
, const float
confidence
_threshold,
const float
confidence_threshold, const float offse
t,
const float
offset, const cnrtDataType_t data_type_inpu
t,
const
cnrtDataType_t data_type_input, const int output_mode, const int algo
,
const
int output_mode, const int algo, void *workspace, void *result_num
,
void
*workspace, void *result_num, void
*output) {
void *output) {
int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
int32_t *loop_end_flag =
int32_t *exit_flag = (int32_t *)((char *)workspace +
(int32_t *)((char *)workspace +
INFO_NUM * input_num_boxes * input_dwidth);
INFO_NUM * input_num_boxes * input_dwidth);
int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth;
int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth;
int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size;
int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size;
...
@@ -1062,88 +397,55 @@ __mlu_global__ void MLUUionXKernelNMS(
...
@@ -1062,88 +397,55 @@ __mlu_global__ void MLUUionXKernelNMS(
__memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM);
__memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM);
}
}
__sync_cluster();
__sync_cluster();
uint32_t output_box_num = 0;
uint32_t output_box_num = 0;
float *score_data;
float *boxes_data;
score_data = (input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data = (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
if (output_mode == 0) {
if (output_mode == 0) {
uint32_t *output_dram = (uint32_t *)output;
if (data_type_input == CNRT_FLOAT32) {
switch (data_type_input) {
nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
default: { return; }
score_data, boxes_data, input_ram, input_num_boxes,
case CNRT_FLOAT16: {
max_output_size, iou_threshold, confidence_threshold,
half *score_data;
offset, output_mode, algo);
half *boxes_data;
} else {
score_data =
nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
(input_ram == SRAM) ? (half *)sram_score : (half *)workspace;
(half *)score_data, (half *)boxes_data, input_ram,
boxes_data =
input_num_boxes, max_output_size, iou_threshold,
(input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes;
confidence_threshold, offset, output_mode, algo);
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
case CNRT_FLOAT32: {
float *score_data;
float *boxes_data;
score_data =
(input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data =
(input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
}
}
} else {
} else {
switch (data_type_input) {
if (data_type_input == CNRT_FLOAT32) {
default: { return; }
nms_detection_ux(exit_flag, output_box_num, (float *)output, score_data,
case CNRT_FLOAT16: {
boxes_data, input_ram, input_num_boxes, max_output_size,
half *output_dram = (half *)output;
iou_threshold, confidence_threshold, offset, output_mode,
half *score_data;
algo);
half *boxes_data;
} else {
score_data =
nms_detection_ux(exit_flag, output_box_num, (half *)output,
(input_ram == SRAM) ? (half *)sram_score : (half *)workspace;
(half *)score_data, (half *)boxes_data, input_ram,
boxes_data =
input_num_boxes, max_output_size, iou_threshold,
(input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes;
confidence_threshold, offset, output_mode, algo);
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
case CNRT_FLOAT32: {
float *output_dram = (float *)output;
float *score_data;
float *boxes_data;
score_data =
(input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data =
(input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
}
}
}
}
((uint32_t *)result_num)[0] = output_box_num;
}
}
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_ptr,
const cnrtDataType_t data_type_input, const void *boxes_ptr,
const void *scores_ptr, const int input_num_boxes,
const void *scores_ptr, const int input_num_boxes,
const int
input_stride, const int max_output_boxes
,
const int
max_output_boxes, const float iou_threshold
,
const float
iou_threshold, const float offset
,
const float
offset, void *workspace_ptr, void *output_size_ptr
,
void
*workspace_ptr, void *output_size_ptr, void
*output_ptr) {
void *output_ptr) {
switch (k_type) {
switch (k_type) {
default: { return; }
default: { return; }
case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_UNION1: {
case CNRT_FUNC_TYPE_UNION1: {
MLUUnion1KernelNMS<<<k_dim, k_type, queue>>>(
MLUUnion1KernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes,
input_stride,
(void *)
boxes_ptr,
(void *)
scores_ptr, input_num_boxes,
max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0,
max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0,
/*output_mode=*/0,
/*output_mode=*/0, workspace_ptr, output_size_ptr, output_ptr,
/*input_layout=*/1, workspace_ptr, output_size_ptr, output_ptr,
data_type_input, offset, /*algo=*/1);
data_type_input, offset, /*algo=*/1);
}; break;
}; break;
case CNRT_FUNC_TYPE_UNION2:
case CNRT_FUNC_TYPE_UNION2:
...
@@ -1151,11 +453,10 @@ void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
...
@@ -1151,11 +453,10 @@ void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
case CNRT_FUNC_TYPE_UNION8:
case CNRT_FUNC_TYPE_UNION8:
case CNRT_FUNC_TYPE_UNION16: {
case CNRT_FUNC_TYPE_UNION16: {
MLUUionXKernelNMS<<<k_dim, k_type, queue>>>(
MLUUionXKernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes, /*input_layout=*/1,
(void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
input_stride, max_output_boxes, iou_threshold,
max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, offset,
/*confidence_threshold=*/0.0, offset, data_type_input,
data_type_input, /*output_mode=*/0, /*algo=*/1, workspace_ptr,
/*output_mode=*/0, /*algo=*/1, workspace_ptr, output_size_ptr,
output_size_ptr, output_ptr);
output_ptr);
}; break;
}; break;
}
}
}
}
mmcv/ops/csrc/common/mlu/nms_utils.hpp
0 → 100644
View file @
e847cf8a
/*************************************************************************
* Copyright (C) [2019-2022] by Cambricon, Inc.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef NMS_UTILS_HPP_
#define NMS_UTILS_HPP_
#include "common_mlu_helper.hpp"
#define NMS_SIZE (64)
#define NMS_UP(x, y) (x / y + (int)(x % y > 0)) * y
#define NMS_DOWN(x, y) (x / y) * y
#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score
#define MEMORY_CORE (0x80)
#define REDUCE_NUM \
(7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
// Acquire the cross-core lock used to serialize GDRAM stores.
// Only needed on MLU270; later architectures make the store path safe
// without it, so this compiles to nothing there.
__mlu_func__ void pvLock() {
#if __BANG_ARCH__ == 270
  // The memory core never takes part in the lock protocol.
  if (MEMORY_CORE != coreId) {
    __bang_lock(0, 0);
  }
#endif
}
// Release the cross-core lock taken by pvLock(). MLU270-only; a no-op on
// every other architecture.
__mlu_func__ void pvUnlock() {
#if __BANG_ARCH__ == 270
  // The memory core never holds the lock, so it must not release it.
  if (MEMORY_CORE != coreId) {
    __bang_unlock(0, 0);
  }
#endif
}
// Clipped ReLU: nram_dst = min(max(nram_src, 0), threshold) element-wise
// (plain ReLU when threshold == 0); does nothing when threshold < 0.
// NOTE(review): on the pre-300 path, nram_tmp appears to need room for
// deal_num + 2 * (NFU_ALIGN_SIZE / sizeof(T)) elements -- confirm with callers.
template <typename T>
static __mlu_func__ void computeReluN(T *nram_dst, T *nram_src, void *nram_tmp,
                                      const int deal_num,
                                      const T threshold = 0) {
  if (threshold < 0) {
    return;
  }
  if (threshold) {
#if __BANG_ARCH__ >= 300
    // Arch >= 300 has a fused ReLU-with-upper-bound instruction.
    __bang_relun(nram_dst, nram_src, deal_num, threshold);
#else
    int align_num = NFU_ALIGN_SIZE / sizeof(T);
    T *nram_aux_a = (T *)nram_tmp;
    T *nram_aux_b = nram_aux_a + deal_num;
    T *nram_zero = nram_aux_b + align_num;
    // nram_aux_b holds the threshold, nram_zero holds 0 (one aligned
    // vector each, cycled across the deal_num elements).
    __bang_write_value(nram_aux_b, align_num, threshold);
    __bang_write_zero(nram_zero, align_num);
    // mask = (src < threshold); dst = src * mask keeps values below the cap.
    __bang_cycle_lt((T *)nram_aux_a, nram_src, (T *)nram_aux_b, deal_num,
                    align_num);
    __bang_mul(nram_dst, nram_src, (T *)nram_aux_a, deal_num);
    // Where the mask was 0 (src >= threshold), substitute the threshold.
    __bang_cycle_eq((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_zero, deal_num,
                    align_num);
    __bang_cycle_mul((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_aux_b,
                     deal_num, align_num);
    __bang_add(nram_dst, nram_dst, (T *)nram_aux_a, deal_num);
    // Finally clamp negatives to zero: dst = dst * (dst > 0).
    __bang_cycle_gt((T *)nram_aux_a, nram_dst, (T *)nram_zero, deal_num,
                    align_num);
    __bang_mul(nram_dst, nram_dst, (T *)nram_aux_a, deal_num);
#endif
  } else {
#if __BANG_ARCH__ >= 300
    __bang_relu(nram_dst, nram_src, deal_num);
#else
    __bang_active_relu(nram_dst, nram_src, deal_num);
#endif
  }
}
// Compute the per-core tiling parameters for the Block / Union1 NMS kernel.
// Boxes are divided as evenly as possible over `core_limit` cores (the first
// `input_box_num % core_limit` cores get one extra box); each core's share is
// then split into NRAM-sized segments of at most `limit` elements, aligned to
// NMS_SIZE. The *_iou_compute variants halve the fp16 segment so the data can
// be widened to fp32 in place during IoU computation.
//
// Outputs (all by reference):
//   input_offset           - this core's starting box index
//   max_seg_pad            - aligned segment length for max-score scans
//   repeat/remain          - full segments and leftover for max-score scans
//   remain_pad             - leftover rounded up to NMS_SIZE
//   max_seg_iou_compute    - aligned segment length for IoU (fp32) passes
//   repeat_iou_compute/remain_iou_compute/remain_pad_iou_compute - same split
//                            for the IoU passes
__mlu_func__ void getComputeParamsBlockOrU1(
    const int input_dwidth, const int input_box_num, const int limit,
    const int core_limit, int &input_offset, int &max_seg_pad, int &repeat,
    int &remain, int &remain_pad, int &max_seg_iou_compute,
    int &repeat_iou_compute, int &remain_iou_compute,
    int &remain_pad_iou_compute) {
  int avg_core = input_box_num / core_limit;
  int rem = input_box_num % core_limit;
  int len_core = avg_core + (coreId < rem ? 1 : 0);
  input_offset = avg_core * coreId + (coreId <= rem ? coreId : rem);
  max_seg_pad = NMS_DOWN(limit, NMS_SIZE);
  repeat = len_core / max_seg_pad;
  remain = len_core % max_seg_pad;
  remain_pad = NMS_UP(remain, NMS_SIZE);

  // If the datatype is fp16, we must convert to fp32 when computing IoU, so
  // only half of the segment can hold source elements.  Use sizeof(float)
  // rather than a magic 4 for consistency with getComputeParamsUx.
  max_seg_iou_compute =
      NMS_DOWN(max_seg_pad / (sizeof(float) / input_dwidth), NMS_SIZE);
  repeat_iou_compute = len_core / max_seg_iou_compute;
  remain_iou_compute = len_core % max_seg_iou_compute;
  remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
}
// Compute the tiling parameters for the multi-cluster (UnionX) NMS kernel.
// Boxes are first distributed over clusters, then over the cores of each
// cluster (early clusters/cores absorb the remainder).  repeat/remain describe
// core 0's max-score scan over its whole cluster's share, while the
// *_iou_compute outputs describe every core's IoU pass over its own share.
__mlu_func__ void getComputeParamsUx(
    const int input_dwidth, const int input_num_boxes, const int limit,
    int &input_offset, int &max_seg_pad, int &repeat, int &remain,
    int &remain_pad, int &max_seg_iou_compute, int &repeat_iou_compute,
    int &remain_iou_compute, int &remain_pad_iou_compute) {
  // Split boxes across clusters, then across the cores of this cluster.
  int per_cluster = input_num_boxes / clusterDim;
  int cluster_rem = input_num_boxes % clusterDim;
  int cluster_len = per_cluster + (clusterId < cluster_rem);
  int cluster_base =
      per_cluster * clusterId + (clusterId <= cluster_rem ? clusterId : cluster_rem);

  int per_core = cluster_len / coreDim;
  int core_rem = cluster_len % coreDim;
  int core_len = per_core + (coreId < core_rem);
  int core_base = per_core * coreId + (coreId <= core_rem ? coreId : core_rem);

  input_offset = cluster_base + core_base;
  max_seg_pad = NMS_DOWN(limit, NMS_SIZE);

  // Core 0 of each cluster scans the whole cluster's share for the max score.
  int scan_len = per_cluster + (clusterId < cluster_rem);
  repeat = scan_len / max_seg_pad;
  remain = scan_len % max_seg_pad;
  remain_pad = NMS_UP(remain, NMS_SIZE);

  // fp16 input is widened to fp32 for the IoU pass, so only part of the
  // segment can hold source elements.
  max_seg_iou_compute =
      NMS_DOWN(max_seg_pad / (sizeof(float) / input_dwidth), NMS_SIZE);
  repeat_iou_compute = core_len / max_seg_iou_compute;
  remain_iou_compute = core_len % max_seg_iou_compute;
  remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
}
// Reduce the per-cluster max boxes (stored at the head of each cluster's SRAM,
// REDUCE_NUM elements per cluster) down to the single global max box, then
// broadcast it so that on return every core's `max_box` holds the winner.
// `inter_x1` is an NRAM scratch buffer of at least NMS_SIZE elements.
template <typename IN_DT>
__mlu_func__ void findGlobalMaxBox(IN_DT *max_box, IN_DT *sram,
                                   IN_DT *inter_x1) {
  // copy all partial max to the sram of cluster 0
  if (clusterId != 0) {
    __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT),
             SRAM2SRAM, 0);
  }
  __sync_all();
  // reduce between clusters to get the global max box
  if (clusterId == 0) {
    if (coreId == 0) {
      __bang_write_zero(inter_x1, NMS_SIZE);
      // Gather just the scores (stride REDUCE_NUM) into a dense vector.
      __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
               REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
      // __bang_max leaves the max value at [0] and its index at [1]
      // (index is 16-bit for half, 32-bit for float).
      __bang_max(max_box, inter_x1, NMS_SIZE);
      int max_cluster = (sizeof(IN_DT) == sizeof(half))
                            ? ((uint16_t *)max_box)[1]
                            : ((uint32_t *)max_box)[1];
      __memcpy(max_box, sram + max_cluster * REDUCE_NUM,
               REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
      __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
    }
    __sync_cluster();
    // Use the MEMORY_CORE macro instead of the raw literal 0x80 (matches
    // pvLock/pvUnlock in this file).
    if (coreId == MEMORY_CORE && clusterDim > 1) {
      // broadcast global max box to each cluster's sram
      for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
        __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
                 cluster_idx);
      }
    }
    __sync_cluster();
  }
  __sync_all();
  // copy the global max box to max_box
  __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
}
// Scan this core's slice of the score array segment by segment and record the
// best box in `max_box` as [score, x1, y1, x2, y2, index(uint32)].
// `max_index` carries the running argmax across calls; `score` and `inter_x1`
// are NRAM scratch buffers of at least max_seg_pad elements.
template <typename IN_DT>
__mlu_func__ void findCoreMaxBox(
    IN_DT *input_score_ptr, IN_DT *score, IN_DT *inter_x1, IN_DT *max_box,
    const IN_DT *input_x1_ptr, const IN_DT *input_y1_ptr,
    const IN_DT *input_x2_ptr, const IN_DT *input_y2_ptr,
    const mluMemcpyDirection_t load_dir, const int input_offset,
    const int repeat, const int remain, const int remain_pad,
    const int max_seg_pad, int &max_index) {
  // Use the MEMORY_CORE macro instead of the raw literal 0x80 (consistent
  // with pvLock/pvUnlock in this file).
  if (coreId != MEMORY_CORE) {
    for (int i = 0; i <= repeat; i++) {
      if (i == repeat && remain == 0) {
        break;
      }
      int seg_len = 0;  // the length every nms compute
      int cpy_len = 0;  // the length every nms memcpy
      seg_len = (i == repeat) ? remain_pad : max_seg_pad;
      cpy_len = (i == repeat) ? remain : max_seg_pad;
      /******NMS LOAD START******/
      // Zero-fill first so alignment padding can never win the max.
      __bang_write_zero(score, seg_len);
      __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
               cpy_len * sizeof(IN_DT), 0);
      /******NMS LOAD END******/
      // __bang_max leaves the value at [0] and its segment-local index at [1].
      __bang_max(inter_x1, score, seg_len);
      if (inter_x1[0] > max_box[0]) {
        max_box[0] = inter_x1[0];
        if (sizeof(IN_DT) == sizeof(half)) {
          max_index = ((uint16_t *)inter_x1)[1] + input_offset +
                      i * max_seg_pad;  // offset start from head of input_data
        } else if (sizeof(IN_DT) == sizeof(float)) {
          max_index = ((uint32_t *)inter_x1)[1] + input_offset +
                      i * max_seg_pad;  // offset start from head of input_data
        }
      }
    }  // for repeat
    // the max box's x1, y1, x2, y2 on every core
    max_box[1] = input_x1_ptr[max_index];
    max_box[2] = input_y1_ptr[max_index];
    max_box[3] = input_x2_ptr[max_index];
    max_box[4] = input_y2_ptr[max_index];
    ((uint32_t *)(max_box + 5))[0] = max_index;
  }
}
// Reduce the per-core max boxes of this cluster into `max_box`.
// Each core first publishes its REDUCE_NUM-element record to SRAM; the scores
// are then gathered and the winning core's full record is copied back.
// `inter_x1` is an NRAM scratch buffer of at least NMS_SIZE elements.
template <typename IN_DT>
__mlu_func__ void findClusterMaxBox(IN_DT *sram, IN_DT *max_box,
                                    IN_DT *inter_x1, IN_DT *input_data_score,
                                    const int core_limit) {
  // find the max with sram
  // copy every core's box info to sram, form: score---x1---y1---x2---y2---
  __memcpy(sram + REDUCE_NUM * coreId, max_box, REDUCE_NUM * sizeof(IN_DT),
           NRAM2SRAM);  // int32_t datatype
  __sync_cluster();
  // copy score from sram to nram and find the max.
  // NMS_SIZE replaces the raw literal 64 used previously (same value,
  // consistent with the rest of this file).
  __bang_write_zero(inter_x1, NMS_SIZE);
  __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
           REDUCE_NUM * sizeof(IN_DT), coreDim - 1);
  // __bang_max leaves the max score at [0] and the winning core's index at [1]
  // (16-bit for half, 32-bit for float).
  __bang_max(max_box, inter_x1, NMS_SIZE);
  int max_core = sizeof(IN_DT) == sizeof(half) ? ((uint16_t *)max_box)[1]
                                               : ((uint32_t *)max_box)[1];
  // copy the max box to max_box
  __memcpy(max_box, sram + max_core * REDUCE_NUM, REDUCE_NUM * sizeof(IN_DT),
           SRAM2NRAM);
}
/*****************************************************************************/
/*******************************CALCULATE MAX AREA****************************/
/*****************************************************************************/
// Compute the area of the current max box (layout: [score, x1, y1, x2, y2]).
// With algo == 1 and a non-zero offset, the offset is added to both the width
// and the height before multiplying; otherwise the plain area is used.
template <typename IN_DT>
__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
                             float &max_area) {
  const float width = (float)max_box[3] - (float)max_box[1];
  const float height = (float)max_box[4] - (float)max_box[2];
  if (algo == 0 || offset == 0.0) {
    max_area = width * height;
  } else {
    max_area = (width + offset) * (height + offset);
  }
}
// Overload that also returns the (order-normalized) corners of the max box.
// A stray inf among the coordinates can break the x1 <= x2 / y1 <= y2
// invariant, so each coordinate pair is swapped into order before the area
// is computed.
template <typename IN_DT>
__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
                             float &max_area, float &max_box_x1,
                             float &max_box_y1, float &max_box_x2,
                             float &max_box_y2) {
  const bool swap_x = max_box[1] > max_box[3];
  const bool swap_y = max_box[2] > max_box[4];
  max_box_x1 = swap_x ? float(max_box[3]) : float(max_box[1]);
  max_box_x2 = swap_x ? float(max_box[1]) : float(max_box[3]);
  max_box_y1 = swap_y ? float(max_box[4]) : float(max_box[2]);
  max_box_y2 = swap_y ? float(max_box[2]) : float(max_box[4]);
  if (algo == 0 || offset == 0.0) {
    max_area = (max_box_x2 - max_box_x1) * (max_box_y2 - max_box_y1);
  } else {
    max_area = (max_box_x2 - max_box_x1 + offset) *
               (max_box_y2 - max_box_y1 + offset);
  }
}
/***********************************************************************/
/*******************************STORE RESULT****************************/
/***********************************************************************/
// Append the current max box to the NRAM staging buffer and, when the buffer
// fills, the loop terminates (score below threshold), or the last keep is
// reached, flush the staged results to GDRAM.  Only cluster 0 / core 0 writes;
// all cores still update nram_save_count / output_box_num so the bookkeeping
// stays consistent.  Output layouts by output_mode:
//   0: box indices (uint32)       1: interleaved score,x1,y1,x2,y2
//   2: planar score---x1---y1---x2---y2 (plane stride max_output_size)
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void storeResult(IN_DT *max_box, OUT_DT *nram_save,
                              OUT_DT *&output_dram, const int keep,
                              const int nram_save_limit_count,
                              const int max_output_size,
                              const float thresh_score, const int output_mode,
                              int &nram_save_count, uint32_t &output_box_num) {
  /******NMS STORE START******/
  // store to nram
  if (float(max_box[0]) > thresh_score) {
    OUT_DT *save_ptr;
    int save_offset = 0;
    int save_str_num = 0;
    save_ptr = nram_save;
    save_offset = nram_save_count;
    save_str_num = nram_save_limit_count;
    if (clusterId == 0 && coreId == 0) {
      if (output_mode == 0) {  // index1, index2, ...
        // The box index is stored as a uint32 right after the 5 box fields.
        save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0];
      } else if (output_mode == 1) {  // score, x1, y1, x2, y2
        __memcpy(save_ptr + save_offset * INFO_NUM, max_box,
                 INFO_NUM * sizeof(IN_DT), NRAM2NRAM, INFO_NUM * sizeof(IN_DT),
                 INFO_NUM * sizeof(IN_DT), 0);
      } else if (output_mode == 2) {  // score---, x1---, y1---, x2---, y2---
        // Strided copy: one element into each of the 5 planes, plane stride
        // save_str_num elements.
        __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), NRAM2NRAM,
                 save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), 4);
      }
    }
    nram_save_count++;
    output_box_num++;
  }
  // store to sram/gdram
  if (output_box_num != 0) {
    // Flush when the staging buffer is full, when the loop is about to stop
    // (current score fell below the threshold), or on the final iteration.
    if ((nram_save_count == nram_save_limit_count) ||
        (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) {
      if (nram_save_count != 0) {
        if (clusterId == 0 && coreId == 0) {
          if (output_mode == 0) {  // index1, index2, ...
            pvLock();
            __memcpy(output_dram, nram_save, nram_save_count * sizeof(uint32_t),
                     NRAM2GDRAM);
            pvUnlock();
            output_dram += nram_save_count;
          } else if (output_mode == 1) {  // score, x1, y1, x2, y2
            pvLock();
            __memcpy(output_dram, nram_save,
                     nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
            pvUnlock();
            output_dram += nram_save_count * INFO_NUM;
          } else if (output_mode ==
                     2) {  // score---, x1---, y1---, x2---, y2---
            // Planar flush: destination plane stride is max_output_size,
            // source plane stride is nram_save_limit_count.
            pvLock();
            __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
                     NRAM2GDRAM, max_output_size * sizeof(IN_DT),
                     nram_save_limit_count * sizeof(IN_DT), 4);
            pvUnlock();
            output_dram += nram_save_count;
          }
          nram_save_count = 0;
        }
      }
    }  // if move data nram->sram/gdram
  }    // if dst
}
template
<
typename
IN_DT
,
typename
OUT_DT
>
__mlu_func__
void
scoreUpdate
(
IN_DT
*
input_score_ptr
,
const
mluMemcpyDirection_t
load_dir
,
const
mluMemcpyDirection_t
store_dir
,
const
IN_DT
*
input_x1_ptr
,
const
IN_DT
*
input_y1_ptr
,
const
IN_DT
*
input_x2_ptr
,
const
IN_DT
*
input_y2_ptr
,
IN_DT
*
x1
,
IN_DT
*
y1
,
IN_DT
*
x2
,
IN_DT
*
y2
,
IN_DT
*
score
,
IN_DT
*
inter_x1
,
IN_DT
*
inter_y1
,
IN_DT
*
inter_x2
,
IN_DT
*
inter_y2
,
IN_DT
*
max_box
,
const
float
max_box_x1
,
const
float
max_box_y1
,
const
float
max_box_x2
,
const
float
max_box_y2
,
OUT_DT
*
nram_save
,
int
repeat_iou_compute
,
int
remain_iou_compute
,
int
remain_pad_iou_compute
,
int
max_seg_iou_compute
,
int
max_seg_pad
,
const
float
thresh_iou
,
const
float
div_thresh_iou
,
const
int
input_offset
,
const
float
offset
,
const
float
max_area
,
const
int
input_num_boxes
,
const
int
algo
)
{
for
(
int
i
=
0
;
i
<=
repeat_iou_compute
;
i
++
)
{
if
(
i
==
repeat_iou_compute
&&
remain_iou_compute
==
0
)
{
break
;
}
int
seg_len
=
(
i
==
repeat_iou_compute
)
?
remain_pad_iou_compute
:
max_seg_iou_compute
;
int
cpy_len
=
(
i
==
repeat_iou_compute
)
?
remain_iou_compute
:
max_seg_iou_compute
;
/******NMS LOAD START******/
int
dt_offset
=
0
;
if
(
sizeof
(
IN_DT
)
==
sizeof
(
float
))
{
__memcpy
(
score
,
input_score_ptr
+
input_offset
+
i
*
max_seg_pad
,
cpy_len
*
sizeof
(
IN_DT
),
load_dir
,
cpy_len
*
sizeof
(
IN_DT
),
cpy_len
*
sizeof
(
IN_DT
),
0
);
dt_offset
=
0
;
}
else
if
(
sizeof
(
IN_DT
)
==
sizeof
(
half
))
{
__memcpy
(
x1
,
input_score_ptr
+
input_offset
+
i
*
max_seg_iou_compute
,
cpy_len
*
sizeof
(
IN_DT
),
load_dir
,
cpy_len
*
sizeof
(
IN_DT
),
cpy_len
*
sizeof
(
IN_DT
),
0
);
__bang_half2float
((
float
*
)
score
,
(
half
*
)
x1
,
seg_len
);
dt_offset
=
max_seg_iou_compute
;
}
#if __BANG_ARCH__ >= 300
__memcpy
(
inter_x1
+
dt_offset
,
input_x1_ptr
+
input_offset
+
i
*
max_seg_iou_compute
,
cpy_len
*
sizeof
(
IN_DT
),
load_dir
,
max_seg_pad
*
sizeof
(
IN_DT
),
input_num_boxes
*
sizeof
(
IN_DT
),
3
);
if
(
sizeof
(
IN_DT
)
==
sizeof
(
half
))
{
__bang_half2float
((
float
*
)
inter_x1
,
(
half
*
)
inter_x1
+
max_seg_iou_compute
,
seg_len
);
__bang_half2float
((
float
*
)
inter_y1
,
(
half
*
)
inter_y1
+
max_seg_iou_compute
,
seg_len
);
__bang_half2float
((
float
*
)
inter_x2
,
(
half
*
)
inter_x2
+
max_seg_iou_compute
,
seg_len
);
__bang_half2float
((
float
*
)
inter_y2
,
(
half
*
)
inter_y2
+
max_seg_iou_compute
,
seg_len
);
}
// box transfer
__bang_minequal
((
float
*
)
x1
,
(
float
*
)
inter_x1
,
(
float
*
)
inter_x2
,
seg_len
);
__bang_maxequal
((
float
*
)
x2
,
(
float
*
)
inter_x1
,
(
float
*
)
inter_x2
,
seg_len
);
__bang_minequal
((
float
*
)
y1
,
(
float
*
)
inter_y1
,
(
float
*
)
inter_y2
,
seg_len
);
__bang_maxequal
((
float
*
)
y2
,
(
float
*
)
inter_y1
,
(
float
*
)
inter_y2
,
seg_len
);
// 1、 compute IOU
// get the area_I
__bang_maxeq_scalar
((
float
*
)
inter_x1
,
(
float
*
)
x1
,
max_box_x1
,
seg_len
);
// inter_x1
__bang_mineq_scalar
((
float
*
)
inter_x2
,
(
float
*
)
x2
,
max_box_x2
,
seg_len
);
// inter_x2
__bang_sub
((
float
*
)
inter_x1
,
(
float
*
)
inter_x2
,
(
float
*
)
inter_x1
,
seg_len
);
if
(
algo
==
1
&&
offset
!=
0.0
)
{
__bang_add_scalar
((
float
*
)
inter_x1
,
(
float
*
)
inter_x1
,
offset
,
seg_len
);
}
computeReluN
((
float
*
)
inter_x1
,
(
float
*
)
inter_x1
,
NULL
,
seg_len
);
// inter_w
__bang_maxeq_scalar
((
float
*
)
inter_y1
,
(
float
*
)
y1
,
float
(
max_box_y1
),
seg_len
);
// inter_y1
__bang_mineq_scalar
((
float
*
)
inter_y2
,
(
float
*
)
y2
,
float
(
max_box_y2
),
seg_len
);
// inter_y2
__bang_sub
((
float
*
)
inter_y1
,
(
float
*
)
inter_y2
,
(
float
*
)
inter_y1
,
seg_len
);
if
(
algo
==
1
&&
offset
!=
0.0
)
{
__bang_add_scalar
((
float
*
)
inter_y1
,
(
float
*
)
inter_y1
,
offset
,
seg_len
);
}
computeReluN
((
float
*
)
inter_y1
,
(
float
*
)
inter_y1
,
NULL
,
seg_len
);
// inter_h
__bang_mul
((
float
*
)
inter_x1
,
(
float
*
)
inter_x1
,
(
float
*
)
inter_y1
,
seg_len
);
// area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
if
(
algo
==
1
&&
offset
!=
0.0
)
{
__bang_fusion
(
FUSION_FSA
,
(
float
*
)
inter_y1
,
(
float
*
)
x2
,
(
float
*
)
x1
,
offset
,
seg_len
,
seg_len
);
__bang_fusion
(
FUSION_FSA
,
(
float
*
)
inter_y2
,
(
float
*
)
y2
,
(
float
*
)
y1
,
offset
,
seg_len
,
seg_len
);
__bang_mul
((
float
*
)
inter_x2
,
(
float
*
)
inter_y1
,
(
float
*
)
inter_y2
,
seg_len
);
// area
}
else
{
__bang_sub
((
float
*
)
inter_y1
,
(
float
*
)
x2
,
(
float
*
)
x1
,
seg_len
);
__bang_fusion
(
FUSION_FSM
,
(
float
*
)
inter_x2
,
(
float
*
)
y2
,
(
float
*
)
y1
,
(
float
*
)
inter_y1
,
seg_len
,
seg_len
);
}
// get the area_U: area + max_area - area_I
__bang_fusion
(
FUSION_FAS
,
(
float
*
)
inter_x2
,
(
float
*
)
inter_x2
,
max_area
,
(
float
*
)
inter_x1
,
seg_len
,
seg_len
);
// 2、 select the box
// if IOU greater than thres, set the score to zero, abort it: area_U >
// area_I * (1 / thresh)?
if
(
thresh_iou
>
0.0
)
{
__bang_mul_scalar
((
float
*
)
inter_x1
,
(
float
*
)
inter_x1
,
div_thresh_iou
,
seg_len
);
}
else
{
__bang_mul_scalar
((
float
*
)
inter_x2
,
(
float
*
)
inter_x2
,
thresh_iou
,
seg_len
);
}
// process for nan
__bang_lt
((
float
*
)
inter_x1
,
(
float
*
)
inter_x2
,
(
float
*
)
inter_x1
,
seg_len
);
__bang_not
((
float
*
)
inter_x1
,
(
float
*
)
inter_x1
,
seg_len
);
__bang_mul
((
float
*
)
score
,
(
float
*
)
score
,
(
float
*
)
inter_x1
,
seg_len
);
/******NMS COMPUTE END******/
#else
__memcpy
(
x1
+
dt_offset
,
input_x1_ptr
+
input_offset
+
i
*
max_seg_iou_compute
,
cpy_len
*
sizeof
(
IN_DT
),
load_dir
,
max_seg_pad
*
sizeof
(
IN_DT
),
input_num_boxes
*
sizeof
(
IN_DT
),
3
);
if
(
sizeof
(
IN_DT
)
==
sizeof
(
half
))
{
__bang_half2float
((
float
*
)
x1
,
(
half
*
)
x1
+
max_seg_iou_compute
,
seg_len
);
__bang_half2float
((
float
*
)
y1
,
(
half
*
)
y1
+
max_seg_iou_compute
,
seg_len
);
__bang_half2float
((
float
*
)
x2
,
(
half
*
)
x2
+
max_seg_iou_compute
,
seg_len
);
__bang_half2float
((
float
*
)
y2
,
(
half
*
)
y2
+
max_seg_iou_compute
,
seg_len
);
}
// 1、 compute IOU
// get the area_I
__bang_write_value
((
float
*
)
inter_y1
,
seg_len
,
float
(
max_box
[
1
]));
// max_x1
__bang_maxequal
((
float
*
)
inter_x1
,
(
float
*
)
x1
,
(
float
*
)
inter_y1
,
seg_len
);
// inter_x1
__bang_write_value
((
float
*
)
inter_y2
,
seg_len
,
float
(
max_box
[
3
]));
// max_x2
__bang_minequal
((
float
*
)
inter_x2
,
(
float
*
)
x2
,
(
float
*
)
inter_y2
,
seg_len
);
// inter_x2
__bang_sub
((
float
*
)
inter_x1
,
(
float
*
)
inter_x2
,
(
float
*
)
inter_x1
,
seg_len
);
if
(
algo
==
1
&&
offset
!=
0.0
)
{
__bang_add_scalar
((
float
*
)
inter_x1
,
(
float
*
)
inter_x1
,
offset
,
seg_len
);
}
computeReluN
((
float
*
)
inter_x1
,
(
float
*
)
inter_x1
,
NULL
,
seg_len
);
// inter_w
__bang_write_value
((
float
*
)
inter_x2
,
seg_len
,
float
(
max_box
[
2
]));
// max_y1
__bang_maxequal
((
float
*
)
inter_y1
,
(
float
*
)
y1
,
(
float
*
)
inter_x2
,
seg_len
);
// inter_y1
__bang_write_value
((
float
*
)
inter_x2
,
seg_len
,
float
(
max_box
[
4
]));
// max_y2
__bang_minequal
((
float
*
)
inter_y2
,
(
float
*
)
y2
,
(
float
*
)
inter_x2
,
seg_len
);
// inter_y2
__bang_sub
((
float
*
)
inter_y1
,
(
float
*
)
inter_y2
,
(
float
*
)
inter_y1
,
seg_len
);
if
(
algo
==
1
&&
offset
!=
0.0
)
{
__bang_add_scalar
((
float
*
)
inter_y1
,
(
float
*
)
inter_y1
,
offset
,
seg_len
);
}
computeReluN
((
float
*
)
inter_y1
,
(
float
*
)
inter_y1
,
NULL
,
seg_len
);
// inter_h
__bang_mul
((
float
*
)
inter_x1
,
(
float
*
)
inter_x1
,
(
float
*
)
inter_y1
,
seg_len
);
// area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
__bang_sub
((
float
*
)
inter_y1
,
(
float
*
)
x2
,
(
float
*
)
x1
,
seg_len
);
__bang_sub
((
float
*
)
inter_y2
,
(
float
*
)
y2
,
(
float
*
)
y1
,
seg_len
);
if
(
algo
==
1
&&
offset
!=
0.0
)
{
__bang_add_scalar
((
float
*
)
inter_y1
,
(
float
*
)
inter_y1
,
offset
,
seg_len
);
__bang_add_scalar
((
float
*
)
inter_y2
,
(
float
*
)
inter_y2
,
offset
,
seg_len
);
}
__bang_mul
((
float
*
)
inter_x2
,
(
float
*
)
inter_y1
,
(
float
*
)
inter_y2
,
seg_len
);
// area
// get the area_U: area + max_area - area_I
__bang_add_scalar
((
float
*
)
inter_x2
,
(
float
*
)
inter_x2
,
float
(
max_area
),
seg_len
);
__bang_sub
((
float
*
)
inter_x2
,
(
float
*
)
inter_x2
,
(
float
*
)
inter_x1
,
seg_len
);
// area_U
// 2、 select the box
// if IOU greater than thresh, set the score to zero, abort it: area_U >
// area_I * (1 / thresh)?
if
(
thresh_iou
>
0.0
)
{
__bang_mul_scalar
((
float
*
)
inter_x1
,
(
float
*
)
inter_x1
,
div_thresh_iou
,
seg_len
);
}
else
{
__bang_mul_scalar
((
float
*
)
inter_x2
,
(
float
*
)
inter_x2
,
thresh_iou
,
seg_len
);
}
__bang_ge
((
float
*
)
inter_x1
,
(
float
*
)
inter_x2
,
(
float
*
)
inter_x1
,
seg_len
);
__bang_mul
((
float
*
)
score
,
(
float
*
)
score
,
(
float
*
)
inter_x1
,
seg_len
);
/******NMS COMPUTE END******/
#endif
// update the score
if
(
sizeof
(
IN_DT
)
==
sizeof
(
half
))
{
convertFloat2half
((
half
*
)
score
,
(
float
*
)
score
,
seg_len
);
}
pvLock
();
__memcpy
(
input_score_ptr
+
input_offset
+
i
*
max_seg_iou_compute
,
score
,
cpy_len
*
sizeof
(
IN_DT
),
store_dir
,
cpy_len
*
sizeof
(
IN_DT
),
cpy_len
*
sizeof
(
IN_DT
),
0
);
pvUnlock
();
}
}
#endif // NMS_UTILS_HPP_
mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu
View file @
e847cf8a
...
@@ -53,9 +53,8 @@ __mlu_func__ void loadDataFromDramToNram(T *dst, const T *src,
...
@@ -53,9 +53,8 @@ __mlu_func__ void loadDataFromDramToNram(T *dst, const T *src,
int w_seg = position.w_end - position.w_start;
int w_seg = position.w_end - position.w_start;
int size = h_seg * w_seg * shape_full.c;
int size = h_seg * w_seg * shape_full.c;
__memcpy(dst,
__memcpy(dst, src + position.n_start * n_offset +
src + position.n_start * n_offset + position.h_start * h_offset +
position.h_start * h_offset + position.w_start * w_offset,
position.w_start * w_offset,
size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T),
size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T),
n_seg - 1);
n_seg - 1);
}
}
...
@@ -89,7 +88,7 @@ __mlu_func__ void psamaskCollectForward(
...
@@ -89,7 +88,7 @@ __mlu_func__ void psamaskCollectForward(
int elem_count =
int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c,
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c,
NFU_ALIGN_SIZE / sizeof(T));
NFU_ALIGN_SIZE / sizeof(T));
__
nramset
(y_nram, elem_count, (T)0);
__
bang_write_value
(y_nram, elem_count, (T)0);
int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c;
int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c;
int y_h_offset = shape_seg.w * shape_seg.c;
int y_h_offset = shape_seg.w * shape_seg.c;
...
@@ -155,7 +154,7 @@ __mlu_func__ void psamaskDistributeForward(
...
@@ -155,7 +154,7 @@ __mlu_func__ void psamaskDistributeForward(
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
int elem_count =
int elem_count =
CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T));
CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T));
__
nramset
(y_nram_temp, elem_count, (T)0);
__
bang_write_value
(y_nram_temp, elem_count, (T)0);
int y_n_offset = align_hw * align_c;
int y_n_offset = align_hw * align_c;
int y_h_offset = shape_seg.w * align_c;
int y_h_offset = shape_seg.w * align_c;
...
@@ -242,7 +241,7 @@ __mlu_func__ void psamaskCollectBackward(
...
@@ -242,7 +241,7 @@ __mlu_func__ void psamaskCollectBackward(
int elem_count =
int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c,
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c,
NFU_ALIGN_SIZE / sizeof(T));
NFU_ALIGN_SIZE / sizeof(T));
__
nramset
(dx_nram, elem_count, (T)0);
__
bang_write_value
(dx_nram, elem_count, (T)0);
int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c;
int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c;
int dy_h_offset = shape_seg.w * dy_full.c;
int dy_h_offset = shape_seg.w * dy_full.c;
...
@@ -331,7 +330,8 @@ __mlu_func__ void psamaskDistributeBackward(
...
@@ -331,7 +330,8 @@ __mlu_func__ void psamaskDistributeBackward(
// fill zeros to dx
// fill zeros to dx
T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c;
T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c;
int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c;
int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c;
__nramset(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), (T)0);
__bang_write_value(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)),
(T)0);
int dy_n_offset_seg = align_hw * align_c;
int dy_n_offset_seg = align_hw * align_c;
int dy_h_offset_seg = shape_seg.w * align_c;
int dy_h_offset_seg = shape_seg.w * align_c;
...
...
mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
View file @
e847cf8a
...
@@ -130,10 +130,10 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
...
@@ -130,10 +130,10 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
__memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM);
__memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM);
// interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
// interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
__bang_mul_
const
(tmp_cyc1, tmp_cyc1, w1, align_channel);
__bang_mul_
scalar
(tmp_cyc1, tmp_cyc1, w1, align_channel);
__bang_mul_
const
(tmp_cyc2, tmp_cyc2, w2, align_channel);
__bang_mul_
scalar
(tmp_cyc2, tmp_cyc2, w2, align_channel);
__bang_mul_
const
(tmp_cyc3, tmp_cyc3, w3, align_channel);
__bang_mul_
scalar
(tmp_cyc3, tmp_cyc3, w3, align_channel);
__bang_mul_
const
(tmp_cyc4, tmp_cyc4, w4, align_channel);
__bang_mul_
scalar
(tmp_cyc4, tmp_cyc4, w4, align_channel);
__bang_add(nram_in, tmp_cyc1, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc1, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc2, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc2, nram_in, align_channel);
...
@@ -146,7 +146,7 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
...
@@ -146,7 +146,7 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
} // loop_roi_grid_w
} // loop_roi_grid_w
} // loop_roi_grid_h
} // loop_roi_grid_h
T count_value = (T)(1.0 / count);
T count_value = (T)(1.0 / count);
__bang_mul_
const
(nram_out, nram_out, count_value, align_channel);
__bang_mul_
scalar
(nram_out, nram_out, count_value, align_channel);
__memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM);
__memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM);
} // loop_cyc_num
} // loop_cyc_num
}
}
...
@@ -242,8 +242,8 @@ __mlu_global__ void MLUUnion1KernelRoiAlignAvg(
...
@@ -242,8 +242,8 @@ __mlu_global__ void MLUUnion1KernelRoiAlignAvg(
case CNRT_FLOAT16: {
case CNRT_FLOAT16: {
roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned,
roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned,
channels, pooled_height, pooled_width, input_height,
channels, pooled_height, pooled_width, input_height,
input_width, sampling_ratio,
input_width, sampling_ratio,
(half)spatial_scale,
(half)spatial_scale,
num_rois);
num_rois);
}; break;
}; break;
case CNRT_FLOAT32: {
case CNRT_FLOAT32: {
roialignForwardAvg((float *)input, (float *)rois, (float *)output,
roialignForwardAvg((float *)input, (float *)rois, (float *)output,
...
@@ -346,31 +346,31 @@ __mlu_func__ void unionRoiAlignBp(
...
@@ -346,31 +346,31 @@ __mlu_func__ void unionRoiAlignBp(
&x_high, &y_low, &y_high);
&x_high, &y_low, &y_high);
if (x_low >= 0 && y_low >= 0) {
if (x_low >= 0 && y_low >= 0) {
__memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM);
__memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer, (T)w1,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer, (T)w1,
c_align);
c_align);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer + c_align,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_low * c,
image_offset + y_low * wo * c + x_low * c,
(T *)buffer + c_align, c);
(T *)buffer + c_align, c);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer, (T)w2,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer, (T)w2,
c_align);
c_align);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer + c_align,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_high * c,
image_offset + y_low * wo * c + x_high * c,
(T *)buffer + c_align, c);
(T *)buffer + c_align, c);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer, (T)w3,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer, (T)w3,
c_align);
c_align);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer + c_align,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_low * c,
image_offset + y_high * wo * c + x_low * c,
(T *)buffer + c_align, c);
(T *)buffer + c_align, c);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer, (T)w4,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer, (T)w4,
c_align);
c_align);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer + c_align,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_high * c,
image_offset + y_high * wo * c + x_high * c,
(T *)buffer + c_align, c);
(T *)buffer + c_align, c);
...
@@ -401,34 +401,34 @@ __mlu_func__ void unionRoiAlignBp(
...
@@ -401,34 +401,34 @@ __mlu_func__ void unionRoiAlignBp(
}
}
__memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T),
__memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T),
GDRAM2NRAM);
GDRAM2NRAM);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer, (T)w1,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer, (T)w1,
align_c);
align_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer + align_c,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
1 / count, align_c);
__bang_atomic_add(
__bang_atomic_add(
(T *)buffer + align_c,
(T *)buffer + align_c,
image_offset + y_low * wo * c + x_low * c + i * deal_once,
image_offset + y_low * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c);
(T *)buffer + align_c, deal_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer, (T)w2,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer, (T)w2,
align_c);
align_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer + align_c,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
1 / count, align_c);
__bang_atomic_add(
__bang_atomic_add(
(T *)buffer + align_c,
(T *)buffer + align_c,
image_offset + y_low * wo * c + x_high * c + i * deal_once,
image_offset + y_low * wo * c + x_high * c + i * deal_once,
(T *)buffer + align_c, deal_c);
(T *)buffer + align_c, deal_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer, (T)w3,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer, (T)w3,
align_c);
align_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer + align_c,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
1 / count, align_c);
__bang_atomic_add(
__bang_atomic_add(
(T *)buffer + align_c,
(T *)buffer + align_c,
image_offset + y_high * wo * c + x_low * c + i * deal_once,
image_offset + y_high * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c);
(T *)buffer + align_c, deal_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer, (T)w4,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer, (T)w4,
align_c);
align_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer + align_c,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
1 / count, align_c);
__bang_atomic_add(
__bang_atomic_add(
(T *)buffer + align_c,
(T *)buffer + align_c,
image_offset + y_high * wo * c + x_high * c + i * deal_once,
image_offset + y_high * wo * c + x_high * c + i * deal_once,
...
...
mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu
View file @
e847cf8a
...
@@ -204,11 +204,11 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
...
@@ -204,11 +204,11 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
}
}
if (is_empty) {
if (is_empty) {
__
nramset
((T *)nram_out, c_slice_align, (T)0);
__
bang_write_value
((T *)nram_out, c_slice_align, (T)0);
__memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
__memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
c_slice * t_size, NRAM2GDRAM);
c_slice * t_size, NRAM2GDRAM);
if (NULL != argmax) {
if (NULL != argmax) {
__
nramset
((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
__
bang_write_value
((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
__memcpy((int32_t *)argmax_base + dst_offset + c_offset,
__memcpy((int32_t *)argmax_base + dst_offset + c_offset,
(int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM);
(int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM);
}
}
...
@@ -238,18 +238,18 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
...
@@ -238,18 +238,18 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
for (int i = 0; i < c_slice; i++) {
for (int i = 0; i < c_slice; i++) {
nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim);
nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim);
}
}
__bang_add_
const
((float *)nram_a, (float *)nram_out, (float)bin_y1,
__bang_add_
scalar
((float *)nram_a, (float *)nram_out, (float)bin_y1,
c_slice_align);
c_slice_align);
__bang_mul_
const
((float *)nram_ping, (float *)nram_a, (float)width,
__bang_mul_
scalar
((float *)nram_ping, (float *)nram_a, (float)width,
c_slice_align);
c_slice_align);
/*compute input_w*/
/*compute input_w*/
__bang_mul_
const
((float *)nram_a, (float *)nram_out, (float)bin_wdim,
__bang_mul_
scalar
((float *)nram_a, (float *)nram_out, (float)bin_wdim,
c_slice_align);
c_slice_align);
__bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a,
__bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a,
c_slice_align);
c_slice_align);
__bang_add_
const
((float *)nram_a, (float *)nram_a, (float)bin_x1,
__bang_add_
scalar
((float *)nram_a, (float *)nram_a, (float)bin_x1,
c_slice_align);
c_slice_align);
__bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a,
__bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a,
c_slice_align);
c_slice_align);
convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a,
convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a,
...
@@ -290,9 +290,7 @@ __mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type,
...
@@ -290,9 +290,7 @@ __mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type,
rois_num, (float)spatial_scale, (float *)output_data,
rois_num, (float)spatial_scale, (float *)output_data,
argmax);
argmax);
}; break;
}; break;
default: {
default: { break; }
break;
}
}
}
}
}
} // namespace forward
} // namespace forward
...
@@ -328,30 +326,30 @@ __mlu_func__ void convertIndex(
...
@@ -328,30 +326,30 @@ __mlu_func__ void convertIndex(
align_c);
align_c);
// Perform 'temp_result - hstart' operation
// Perform 'temp_result - hstart' operation
__bang_sub_
const
((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
__bang_sub_
scalar
((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
align_c);
align_c);
// Perform 'temp_result1 - temp_result2 * width' operation
// Perform 'temp_result1 - temp_result2 * width' operation
__bang_mul_
const
((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
__bang_mul_
scalar
((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
align_c);
align_c);
convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
(int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
(int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
__bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp,
__bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp,
(float *)nram_argmax_fp_w, align_c);
(float *)nram_argmax_fp_w, align_c);
// Perform 'temp_result - wstart' operation
// Perform 'temp_result - wstart' operation
__bang_sub_
const
((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w,
wstart,
__bang_sub_
scalar
((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w,
align_c);
wstart,
align_c);
// Perform 'temp_result = h * w_compute + w' operation
// Perform 'temp_result = h * w_compute + w' operation
__bang_mul_
const
((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
__bang_mul_
scalar
((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
w_compute, align_c);
w_compute, align_c);
__bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
__bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
(float *)nram_argmax_fp_w, align_c);
(float *)nram_argmax_fp_w, align_c);
if (loop_flag == 1) {
if (loop_flag == 1) {
__bang_sub_
const
((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
__bang_sub_
scalar
((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
(loop_id * true_limit), align_c);
(loop_id * true_limit), align_c);
}
}
convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1,
convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1,
(float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2,
(float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2,
...
@@ -460,21 +458,22 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
...
@@ -460,21 +458,22 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
*/
*/
// Load the data from GDRAM to NRAM.
// Load the data from GDRAM to NRAM.
__memcpy((T *)nram_grads + align_c * high_precision,
__memcpy(
(const T *)grads + (n * pooled_height * pooled_width +
(T *)nram_grads + align_c * high_precision,
ph * pooled_width + pw) *
(const T *)grads +
channels,
(n * pooled_height * pooled_width + ph * pooled_width + pw) *
channels * sizeof(T), GDRAM2NRAM);
channels,
channels * sizeof(T), GDRAM2NRAM);
if (high_precision) {
if (high_precision) {
__bang_half2float((float *)nram_grads,
__bang_half2float((float *)nram_grads,
(half *)nram_grads + align_c * high_precision,
(half *)nram_grads + align_c * high_precision,
align_c);
align_c);
}
}
__memcpy((int32_t *)nram_argmax,
__memcpy((int32_t *)nram_argmax,
(const int32_t *)argmax +
(const int32_t *)argmax +
(n * pooled_height * pooled_width +
(n * pooled_height * pooled_width +
ph * pooled_width + pw) *
ph * pooled_width + pw) *
channels,
channels,
channels * sizeof(int32_t), GDRAM2NRAM);
channels * sizeof(int32_t), GDRAM2NRAM);
// Perform pooling operation on NRAM.
// Perform pooling operation on NRAM.
...
@@ -523,20 +522,21 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
...
@@ -523,20 +522,21 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
*/
*/
// Load the data from GDRAM to NRAM.
// Load the data from GDRAM to NRAM.
__memcpy((T *)nram_grads + align_c * high_precision,
__memcpy(
(const T *)grads + (n * pooled_height * pooled_width +
(T *)nram_grads + align_c * high_precision,
ph * pooled_width + pw) *
(const T *)grads +
channels,
(n * pooled_height * pooled_width + ph * pooled_width + pw) *
channels * sizeof(T), GDRAM2NRAM);
channels,
channels * sizeof(T), GDRAM2NRAM);
if (high_precision) {
if (high_precision) {
__bang_half2float((float *)nram_grads,
__bang_half2float((float *)nram_grads,
(half *)nram_grads + align_c * high_precision,
(half *)nram_grads + align_c * high_precision,
align_c);
align_c);
}
}
__memcpy((int32_t *)nram_argmax,
__memcpy((int32_t *)nram_argmax,
(const int32_t *)argmax +
(const int32_t *)argmax +
(n * pooled_height * pooled_width +
(n * pooled_height * pooled_width +
ph * pooled_width + pw) *
ph * pooled_width + pw) *
channels,
channels,
channels * sizeof(int32_t), GDRAM2NRAM);
channels * sizeof(int32_t), GDRAM2NRAM);
int ping_pong = 0;
int ping_pong = 0;
...
@@ -713,9 +713,7 @@ __mlu_global__ void MLUKernelRoiPoolBackward(
...
@@ -713,9 +713,7 @@ __mlu_global__ void MLUKernelRoiPoolBackward(
height, width, pooled_height, pooled_width, rois_num,
height, width, pooled_height, pooled_width, rois_num,
(const float)spatial_scale, high_precision);
(const float)spatial_scale, high_precision);
}; break;
}; break;
default: {
default: { break; }
break;
}
}
}
}
}
} // namespace backward
} // namespace backward
...
...
mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu
View file @
e847cf8a
...
@@ -26,7 +26,7 @@ __mlu_func__ void mluMultiKernelTinShift(
...
@@ -26,7 +26,7 @@ __mlu_func__ void mluMultiKernelTinShift(
int t_shift = shifts[n_index * group_size + group_id];
int t_shift = shifts[n_index * group_size + group_id];
int index = cur_channel_index % channel_size * hw_size +
int index = cur_channel_index % channel_size * hw_size +
n_index * time_size * channel_size * hw_size;
n_index * time_size * channel_size * hw_size;
__
nramset
(data_nram, MAX_NRAM_SIZE, (char)0);
__
bang_write_value
(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;");
__asm__ volatile("sync;");
if (abs(t_shift) >= time_size) {
if (abs(t_shift) >= time_size) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
...
@@ -109,7 +109,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence(
...
@@ -109,7 +109,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence(
int next_sequence_index =
int next_sequence_index =
index / hw_size / channel_size % time_size + segmentime_size;
index / hw_size / channel_size % time_size + segmentime_size;
int cur_sequence_index = index / hw_size / channel_size % time_size;
int cur_sequence_index = index / hw_size / channel_size % time_size;
__
nramset
(data_nram, MAX_NRAM_SIZE, (char)0);
__
bang_write_value
(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;");
__asm__ volatile("sync;");
if (max_number_hw_per_core == 0) {
if (max_number_hw_per_core == 0) {
mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index,
mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment