Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
MMCV
Commits
e847cf8a
Commit
e847cf8a
authored
Oct 10, 2022
by
bdf
Committed by
Zaida Zhou
Nov 23, 2022
Browse files
[Refactor] Adapt mlu code to cntoolkit3.0.1
parent
4c6e99c8
Changes
9
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
967 additions
and
1092 deletions
+967
-1092
mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu
+20
-20
mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu
+13
-13
mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
+50
-27
mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
+241
-940
mmcv/ops/csrc/common/mlu/nms_utils.hpp
mmcv/ops/csrc/common/mlu/nms_utils.hpp
+553
-0
mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu
+7
-7
mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
+39
-39
mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu
+42
-44
mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu
+2
-2
No files found.
mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu
View file @
e847cf8a
...
...
@@ -88,14 +88,14 @@ __mlu_func__ void bboxOverlapsWorkflow(
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_
const
(vec_left, vec_left, (T)offset, batches_stride);
__bang_add_
scalar
(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_
const
(vec_right, vec_right, (T)offset, batches_stride);
__bang_add_
scalar
(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__
nramset
(vec_bottom, batches_stride, 0.f);
__
bang_write_value
(vec_bottom, batches_stride, 0.f);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
...
...
@@ -107,11 +107,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_
const
(vec_top, vec_top, (T)offset, batches_stride);
__bang_add_
scalar
(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_
const
(vec_bottom, vec_bottom, (T)offset, batches_stride);
__bang_add_
scalar
(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
...
...
@@ -121,11 +121,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_
const
(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
__bang_add_
scalar
(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_
const
(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
__bang_add_
scalar
(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
...
...
@@ -137,7 +137,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
T *inter_s = height;
// offset vector ---> vec_b2_y1
__
nramset
(vec_b2_y1, batches_stride, T(offset));
__
bang_write_value
(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
...
...
@@ -164,10 +164,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
int32_t base1 = b1 * COORD_NUM;
// set bbox1 and bbox2 to nram
__
nramset
(vec_b1_x1, batches_stride, bbox1[base1]);
__
nramset
(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
__
nramset
(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
__
nramset
(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
__
bang_write_value
(vec_b1_x1, batches_stride, bbox1[base1]);
__
bang_write_value
(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
__
bang_write_value
(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
__
bang_write_value
(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
for (int32_t j = 0; j < num_loop_cpy; j++) {
int32_t index2 = j * batches_stride;
...
...
@@ -195,13 +195,13 @@ __mlu_func__ void bboxOverlapsWorkflow(
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_
const
(vec_left, vec_left, (T)offset, batches_stride);
__bang_add_
scalar
(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_
const
(vec_right, vec_right, (T)offset, batches_stride);
__bang_add_
scalar
(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__
nramset
(vec_bottom, batches_stride, (T)0);
__
bang_write_value
(vec_bottom, batches_stride, (T)0);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
...
...
@@ -213,10 +213,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_
const
(vec_top, vec_top, (T)offset, batches_stride);
__bang_add_
scalar
(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_
const
(vec_bottom, vec_bottom, (T)offset, batches_stride);
__bang_add_
scalar
(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
...
...
@@ -225,10 +225,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_
const
(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
__bang_add_
scalar
(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_
const
(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
__bang_add_
scalar
(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
...
...
@@ -239,7 +239,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
T *inter_s = height;
// offset vector ---> vec_b2_y1
__
nramset
(vec_b2_y1, batches_stride, T(offset));
__
bang_write_value
(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
...
...
mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu
View file @
e847cf8a
...
...
@@ -139,7 +139,7 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1;
// set output_nram to zero
__
nramset
(output_nram, param.output_nram_size, T(0));
__
bang_write_value
(output_nram, param.output_nram_size, T(0));
// loop blocks of kernel window: grid_dim.(Kh, Kw)
for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) {
...
...
@@ -313,7 +313,7 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
T *sum = sum_array;
for (int g = 0; g < blkSize.G; ++g) {
__bang_mul_
const
(sum, src, mask_array[mask_index],
__bang_mul_
scalar
(sum, src, mask_array[mask_index],
param.block_Cg_NFU);
//
// NOTE: Since block_Cg_NFU >= block_Cg_stride,
...
...
@@ -446,7 +446,7 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
T *base_grad_input = (T *)grad_input + input_index;
__memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T),
GDRAM2NRAM);
__bang_mul_
const
((T *)grad_input_buff, (T *)grad_output_buff,
__bang_mul_
scalar
((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], num_align);
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
(T *)grad_input_buff, num_align);
...
...
@@ -485,7 +485,7 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
T *base_grad_input = (T *)grad_input + input_index;
__memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T),
GDRAM2NRAM);
__bang_mul_
const
((T *)grad_input_buff, (T *)grad_output_buff,
__bang_mul_
scalar
((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], rem_for_loop_align);
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
(T *)grad_input_buff, rem_for_loop);
...
...
@@ -541,12 +541,12 @@ void KernelCarafeBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
const int wi, const int c, const int k_up,
const int group, const int scale) {
if (dtype == CNRT_FLOAT16) {
backward::MLUUnion1KernelCarafeBackward<half>
<<<k_dim, k_type, queue>>>(
input, mask, grad_output, grad_input,
grad_mask, n, hi, wi, c, k_up,
group, scale);
backward::MLUUnion1KernelCarafeBackward<half>
<<<k_dim, k_type, queue>>>(
input, mask, grad_output, grad_input,
grad_mask, n, hi, wi, c, k_up,
group, scale);
} else {
backward::MLUUnion1KernelCarafeBackward<float>
<<<k_dim, k_type, queue>>>(
input, mask, grad_output, grad_input,
grad_mask, n, hi, wi, c, k_up,
group, scale);
backward::MLUUnion1KernelCarafeBackward<float>
<<<k_dim, k_type, queue>>>(
input, mask, grad_output, grad_input,
grad_mask, n, hi, wi, c, k_up,
group, scale);
}
}
mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
View file @
e847cf8a
...
...
@@ -211,50 +211,51 @@ __mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src,
// get sign bit
const
float
move_23bit
=
8388608.0
;
// 0x80000000 = 1,00000000,00000000000000000000000
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x80000000
);
__bang_cycle_band
((
char
*
)
dst_addition
,
(
char
*
)
src
,
(
char
*
)
src_addition
,
src_count
*
sizeof
(
float
),
NFU_ALIGN_SIZE
);
// get 1 or 0 from sign bit
// judge whether it is odd
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x00000001
);
__bang_cycle_bor
((
char
*
)
dst_addition
,
(
char
*
)
dst_addition
,
(
char
*
)
src_addition
,
src_count
*
sizeof
(
float
),
NFU_ALIGN_SIZE
);
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x80000001
);
__bang_cycle_eq
(
dst_addition
,
dst_addition
,
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
// minus xor, positive num invariant
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0xffffffff
);
__bang_cycle_mul
(
dst
,
dst_addition
,
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
__bang_bxor
((
char
*
)
dst
,
(
char
*
)
src
,
(
char
*
)
dst
,
src_count
*
sizeof
(
float
));
// convert int32 to float32
__nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x7fffff
);
__bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x7fffff
);
__bang_cycle_band
((
char
*
)
dst
,
(
char
*
)
dst
,
(
char
*
)
src_addition
,
src_count
*
sizeof
(
float
),
NFU_ALIGN_SIZE
);
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x4b000000
);
__bang_cycle_bor
((
char
*
)
dst
,
(
char
*
)
dst
,
(
char
*
)
src_addition
,
src_count
*
sizeof
(
float
),
NFU_ALIGN_SIZE
);
__bang_sub_
const
(
dst
,
dst
,
move_23bit
,
src_count
);
__bang_sub_
scalar
(
dst
,
dst
,
move_23bit
,
src_count
);
// add one
__bang_add
(
dst
,
dst
,
dst_addition
,
src_count
);
// set sign for float32
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0xffffffff
);
__bang_cycle_mul
(
dst_addition
,
dst_addition
,
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x00000001
);
__bang_cycle_add
(
dst_addition
,
dst_addition
,
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x80000000
);
__bang_cycle_band
((
char
*
)
dst_addition
,
(
char
*
)
dst_addition
,
(
char
*
)
src_addition
,
src_count
*
4
,
128
);
...
...
@@ -291,18 +292,20 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
// dst_addition = abs(src)
__bang_mul
(
dst_addition
,
src
,
(
float
*
)
dst
,
src_count
);
// if dst_addition < 1.0 , then src_addition + 1, to fix add error.
__nramset
((
float
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
1.0
f
);
__bang_write_value
((
float
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
1.0
f
);
__bang_cycle_lt
(
dst_addition
,
dst_addition
,
(
float
*
)
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
__bang_add_tz
((
float
*
)
dst
,
(
float
*
)
dst
,
(
float
*
)
dst_addition
,
src_count
);
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0xbf800000
);
// set negative flag -1.0 = 0xbf800000
__bang_cycle_eq
(
(
float
*
)
dst
,
(
float
*
)
dst
,
(
float
*
)
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
// to mark all src in [x<-1.0]
__bang_active_abs
(
dst_addition
,
src
,
src_count
);
__nramset
((
float
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
8388608.0
f
);
__bang_write_value
((
float
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
8388608.0
f
);
// mask shift move 23
__bang_cycle_add_tz
(
dst_addition
,
dst_addition
,
src_addition
,
src_count
,
...
...
@@ -314,11 +317,11 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
// to fix max value
// 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0x4b7fffff <=> 16777215.0,
// means max value.
__bang_mul_
const
((
float
*
)
dst
,
(
float
*
)
dst
,
16777215.0
,
src_count
);
__bang_mul_
scalar
((
float
*
)
dst
,
(
float
*
)
dst
,
16777215.0
,
src_count
);
__bang_bxor
((
char
*
)
dst_addition
,
(
char
*
)
dst_addition
,
(
char
*
)
dst
,
src_count
*
floatDchar
);
// get low 23bit
__
nramset
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
__
bang_write_value
((
unsigned
*
)
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
(
unsigned
)
0x007fffff
);
// mask low 23bit is 1
__bang_cycle_band
((
char
*
)
dst_addition
,
(
char
*
)
dst_addition
,
...
...
@@ -327,16 +330,36 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
// set 9 high bit ===> dst
// -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000
// 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000
__
nramset
(
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x3f800000
);
__
bang_write_value
(
src_addition
,
NFU_ALIGN_SIZE
/
sizeof
(
float
),
0x3f800000
);
__bang_cycle_and
((
float
*
)
dst
,
(
float
*
)
dst
,
src_addition
,
src_count
,
NFU_ALIGN_SIZE
/
sizeof
(
float
));
// src or dst_addition
__bang_bor
((
char
*
)
dst_addition
,
(
char
*
)
dst
,
(
char
*
)
dst_addition
,
src_count
*
floatDchar
);
__bang_mul_
const
((
float
*
)
dst
,
(
float
*
)
dst
,
-
2.0
,
src_count
);
__bang_mul_
scalar
((
float
*
)
dst
,
(
float
*
)
dst
,
-
2.0
,
src_count
);
__bang_bor
((
char
*
)
dst
,
(
char
*
)
dst
,
(
char
*
)
dst_addition
,
src_count
*
floatDchar
);
#endif // __BANG_ARCH__ >= 300
}
/*!
 * @brief Narrows a buffer of float32 values to the half data type.
 *
 * The conversion intrinsic is chosen at compile time from __BANG_ARCH__:
 * __bang_float2half_rd is used when it is below 300 (the MLU200 case, rd
 * rounding), and __bang_float2half_rn otherwise (the MLU300 case, rn
 * rounding).
 *
 * @param[out] dst
 *   Pointer to NRAM that receives the half type results.
 * @param[in] src
 *   Pointer to NRAM that holds the float32 inputs.
 * @param[in] src_count
 *   Number of elements in src to convert.
 */
__mlu_func__ inline void convertFloat2half(half *dst, float *src,
                                           int src_count) {
#if __BANG_ARCH__ < 300
  // Pre-300 architectures: rd variant of the conversion.
  __bang_float2half_rd(dst, src, src_count);
#else
  // Architecture 300 and newer: rn variant of the conversion.
  __bang_float2half_rn(dst, src, src_count);
#endif
}
#endif // COMMON_MLU_HELPER_HPP_
mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
View file @
e847cf8a
This diff is collapsed.
Click to expand it.
mmcv/ops/csrc/common/mlu/nms_utils.hpp
0 → 100644
View file @
e847cf8a
This diff is collapsed.
Click to expand it.
mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu
View file @
e847cf8a
...
...
@@ -53,9 +53,8 @@ __mlu_func__ void loadDataFromDramToNram(T *dst, const T *src,
int w_seg = position.w_end - position.w_start;
int size = h_seg * w_seg * shape_full.c;
__memcpy(dst,
src + position.n_start * n_offset + position.h_start * h_offset +
position.w_start * w_offset,
__memcpy(dst, src + position.n_start * n_offset +
position.h_start * h_offset + position.w_start * w_offset,
size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T),
n_seg - 1);
}
...
...
@@ -89,7 +88,7 @@ __mlu_func__ void psamaskCollectForward(
int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c,
NFU_ALIGN_SIZE / sizeof(T));
__
nramset
(y_nram, elem_count, (T)0);
__
bang_write_value
(y_nram, elem_count, (T)0);
int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c;
int y_h_offset = shape_seg.w * shape_seg.c;
...
...
@@ -155,7 +154,7 @@ __mlu_func__ void psamaskDistributeForward(
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
int elem_count =
CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T));
__
nramset
(y_nram_temp, elem_count, (T)0);
__
bang_write_value
(y_nram_temp, elem_count, (T)0);
int y_n_offset = align_hw * align_c;
int y_h_offset = shape_seg.w * align_c;
...
...
@@ -242,7 +241,7 @@ __mlu_func__ void psamaskCollectBackward(
int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c,
NFU_ALIGN_SIZE / sizeof(T));
__
nramset
(dx_nram, elem_count, (T)0);
__
bang_write_value
(dx_nram, elem_count, (T)0);
int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c;
int dy_h_offset = shape_seg.w * dy_full.c;
...
...
@@ -331,7 +330,8 @@ __mlu_func__ void psamaskDistributeBackward(
// fill zeros to dx
T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c;
int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c;
__nramset(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), (T)0);
__bang_write_value(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)),
(T)0);
int dy_n_offset_seg = align_hw * align_c;
int dy_h_offset_seg = shape_seg.w * align_c;
...
...
mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
View file @
e847cf8a
...
...
@@ -130,10 +130,10 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
__memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM);
// interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
__bang_mul_
const
(tmp_cyc1, tmp_cyc1, w1, align_channel);
__bang_mul_
const
(tmp_cyc2, tmp_cyc2, w2, align_channel);
__bang_mul_
const
(tmp_cyc3, tmp_cyc3, w3, align_channel);
__bang_mul_
const
(tmp_cyc4, tmp_cyc4, w4, align_channel);
__bang_mul_
scalar
(tmp_cyc1, tmp_cyc1, w1, align_channel);
__bang_mul_
scalar
(tmp_cyc2, tmp_cyc2, w2, align_channel);
__bang_mul_
scalar
(tmp_cyc3, tmp_cyc3, w3, align_channel);
__bang_mul_
scalar
(tmp_cyc4, tmp_cyc4, w4, align_channel);
__bang_add(nram_in, tmp_cyc1, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc2, nram_in, align_channel);
...
...
@@ -146,7 +146,7 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
} // loop_roi_grid_w
} // loop_roi_grid_h
T count_value = (T)(1.0 / count);
__bang_mul_
const
(nram_out, nram_out, count_value, align_channel);
__bang_mul_
scalar
(nram_out, nram_out, count_value, align_channel);
__memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM);
} // loop_cyc_num
}
...
...
@@ -242,8 +242,8 @@ __mlu_global__ void MLUUnion1KernelRoiAlignAvg(
case CNRT_FLOAT16: {
roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned,
channels, pooled_height, pooled_width, input_height,
input_width, sampling_ratio,
(half)spatial_scale,
num_rois);
input_width, sampling_ratio,
(half)spatial_scale,
num_rois);
}; break;
case CNRT_FLOAT32: {
roialignForwardAvg((float *)input, (float *)rois, (float *)output,
...
...
@@ -346,30 +346,30 @@ __mlu_func__ void unionRoiAlignBp(
&x_high, &y_low, &y_high);
if (x_low >= 0 && y_low >= 0) {
__memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer, (T)w1,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer, (T)w1,
c_align);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer + c_align,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_low * c,
(T *)buffer + c_align, c);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer, (T)w2,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer, (T)w2,
c_align);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer + c_align,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_high * c,
(T *)buffer + c_align, c);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer, (T)w3,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer, (T)w3,
c_align);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer + c_align,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_low * c,
(T *)buffer + c_align, c);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer, (T)w4,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer, (T)w4,
c_align);
__bang_mul_
const
((T *)buffer + c_align, (T *)buffer + c_align,
__bang_mul_
scalar
((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_high * c,
...
...
@@ -401,33 +401,33 @@ __mlu_func__ void unionRoiAlignBp(
}
__memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T),
GDRAM2NRAM);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer, (T)w1,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer, (T)w1,
align_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer + align_c,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_low * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer, (T)w2,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer, (T)w2,
align_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer + align_c,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_low * wo * c + x_high * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer, (T)w3,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer, (T)w3,
align_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer + align_c,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_high * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer, (T)w4,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer, (T)w4,
align_c);
__bang_mul_
const
((T *)buffer + align_c, (T *)buffer + align_c,
__bang_mul_
scalar
((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
...
...
mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu
View file @
e847cf8a
...
...
@@ -204,11 +204,11 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
}
if (is_empty) {
__
nramset
((T *)nram_out, c_slice_align, (T)0);
__
bang_write_value
((T *)nram_out, c_slice_align, (T)0);
__memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
c_slice * t_size, NRAM2GDRAM);
if (NULL != argmax) {
__
nramset
((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
__
bang_write_value
((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
__memcpy((int32_t *)argmax_base + dst_offset + c_offset,
(int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM);
}
...
...
@@ -238,17 +238,17 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
for (int i = 0; i < c_slice; i++) {
nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim);
}
__bang_add_
const
((float *)nram_a, (float *)nram_out, (float)bin_y1,
__bang_add_
scalar
((float *)nram_a, (float *)nram_out, (float)bin_y1,
c_slice_align);
__bang_mul_
const
((float *)nram_ping, (float *)nram_a, (float)width,
__bang_mul_
scalar
((float *)nram_ping, (float *)nram_a, (float)width,
c_slice_align);
/*compute input_w*/
__bang_mul_
const
((float *)nram_a, (float *)nram_out, (float)bin_wdim,
__bang_mul_
scalar
((float *)nram_a, (float *)nram_out, (float)bin_wdim,
c_slice_align);
__bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a,
c_slice_align);
__bang_add_
const
((float *)nram_a, (float *)nram_a, (float)bin_x1,
__bang_add_
scalar
((float *)nram_a, (float *)nram_a, (float)bin_x1,
c_slice_align);
__bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a,
c_slice_align);
...
...
@@ -290,9 +290,7 @@ __mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type,
rois_num, (float)spatial_scale, (float *)output_data,
argmax);
}; break;
default: {
break;
}
default: { break; }
}
}
} // namespace forward
...
...
@@ -328,11 +326,11 @@ __mlu_func__ void convertIndex(
align_c);
// Perform 'temp_result - hstart' operation
__bang_sub_
const
((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
__bang_sub_
scalar
((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
align_c);
// Perform 'temp_result1 - temp_result2 * width' operation
__bang_mul_
const
((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
__bang_mul_
scalar
((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
align_c);
convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
(int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
...
...
@@ -340,17 +338,17 @@ __mlu_func__ void convertIndex(
(float *)nram_argmax_fp_w, align_c);
// Perform 'temp_result - wstart' operation
__bang_sub_
const
((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w,
wstart,
align_c);
__bang_sub_
scalar
((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w,
wstart,
align_c);
// Perform 'temp_result = h * w_compute + w' operation
__bang_mul_
const
((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
__bang_mul_
scalar
((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
w_compute, align_c);
__bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
(float *)nram_argmax_fp_w, align_c);
if (loop_flag == 1) {
__bang_sub_
const
((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
__bang_sub_
scalar
((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
(loop_id * true_limit), align_c);
}
convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1,
...
...
@@ -460,9 +458,10 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
*/
// Load the data from GDRAM to NRAM.
__memcpy((T *)nram_grads + align_c * high_precision,
(const T *)grads + (n * pooled_height * pooled_width +
ph * pooled_width + pw) *
__memcpy(
(T *)nram_grads + align_c * high_precision,
(const T *)grads +
(n * pooled_height * pooled_width + ph * pooled_width + pw) *
channels,
channels * sizeof(T), GDRAM2NRAM);
if (high_precision) {
...
...
@@ -471,8 +470,8 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
align_c);
}
__memcpy((int32_t *)nram_argmax,
(const int32_t *)argmax +
(n * pooled_height * pooled_width +
__memcpy((int32_t *)nram_argmax,
(const int32_t *)argmax +
(n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
channels * sizeof(int32_t), GDRAM2NRAM);
...
...
@@ -523,9 +522,10 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
*/
// Load the data from GDRAM to NRAM.
__memcpy((T *)nram_grads + align_c * high_precision,
(const T *)grads + (n * pooled_height * pooled_width +
ph * pooled_width + pw) *
__memcpy(
(T *)nram_grads + align_c * high_precision,
(const T *)grads +
(n * pooled_height * pooled_width + ph * pooled_width + pw) *
channels,
channels * sizeof(T), GDRAM2NRAM);
if (high_precision) {
...
...
@@ -533,8 +533,8 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
(half *)nram_grads + align_c * high_precision,
align_c);
}
__memcpy((int32_t *)nram_argmax,
(const int32_t *)argmax +
(n * pooled_height * pooled_width +
__memcpy((int32_t *)nram_argmax,
(const int32_t *)argmax +
(n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
channels * sizeof(int32_t), GDRAM2NRAM);
...
...
@@ -713,9 +713,7 @@ __mlu_global__ void MLUKernelRoiPoolBackward(
height, width, pooled_height, pooled_width, rois_num,
(const float)spatial_scale, high_precision);
}; break;
default: {
break;
}
default: { break; }
}
}
} // namespace backward
...
...
mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu
View file @
e847cf8a
...
...
@@ -26,7 +26,7 @@ __mlu_func__ void mluMultiKernelTinShift(
int t_shift = shifts[n_index * group_size + group_id];
int index = cur_channel_index % channel_size * hw_size +
n_index * time_size * channel_size * hw_size;
__
nramset
(data_nram, MAX_NRAM_SIZE, (char)0);
__
bang_write_value
(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;");
if (abs(t_shift) >= time_size) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
...
...
@@ -109,7 +109,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence(
int next_sequence_index =
index / hw_size / channel_size % time_size + segmentime_size;
int cur_sequence_index = index / hw_size / channel_size % time_size;
__
nramset
(data_nram, MAX_NRAM_SIZE, (char)0);
__
bang_write_value
(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;");
if (max_number_hw_per_core == 0) {
mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment