OpenDAS / MMCV · Commits

Commit 91da9643, authored Aug 13, 2024 by limm

support v2.1.0

parent 6f674c7e

Changes: 139 (showing 20 changed files with 67 additions and 7350 deletions, +67 / -7350)
mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu                             +0   -431
mmcv/ops/csrc/common/mlu/iou3d_utils.hpp                                  +0   -695
mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu                    +0   -853
mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu                               +0   -483
mmcv/ops/csrc/common/mlu/nms_utils.hpp                                    +0   -553
mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu                           +0   -615
mmcv/ops/csrc/common/mlu/psamask_utils.hpp                                +0   -55
mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu                         +0   -493
mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu                 +0   -490
mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp                      +0   -24
mmcv/ops/csrc/common/mlu/roiaware_pool3d_mlu_kernel.mlu                   +0   -747
mmcv/ops/csrc/common/mlu/roipoint_pool3d_large_boxes_num_mlu_kernel.mlu   +0   -536
mmcv/ops/csrc/common/mlu/roipoint_pool3d_mlu_kernel.mlu                   +0   -544
mmcv/ops/csrc/common/mlu/three_nn_mlu_kernel.mlu                          +0   -466
mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu                         +0   -307
mmcv/ops/csrc/common/pytorch_npu_helper.hpp                               +13  -1
mmcv/ops/csrc/common/utils/spconv/tensorview/tensorview.h                 +3   -2
mmcv/ops/csrc/parrots/cudabind.cpp                                        +0   -51
mmcv/ops/csrc/pytorch/bbox_overlaps.cpp                                   +46  -0
mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu                               +5   -4
mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu (deleted, 100644 → 0)
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include "iou3d_utils.hpp"
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
/* NRAM buffer
 * Suppose we deal with N boxes at a time.
 ------------------------------------------------------------------------
 | Basic | score(1N) +        | intersect_pts(48N)  |                    |
 |       | valid_box(1N) +    | + ordered_pts(48N)  | temp_long(72N)     |
 |       | temp_buffer(10N)   |                     |                    |
 |-------|--------------------|---------------------|--------------------|
 | Reuse | null               | null                | rotated_pts(16N)   |
 |-------|--------------------|---------------------|--------------------|
 ---------------------------------------------------------------------------
 | Basic | dist_ram(24N)   | valid_pts(24N)    | box1(5N)   | box1_buffer(5KB) |
 |       |                 | + nums_in_ram(1N) | + box2(5N) | + nram_save(5KB) |
 |-------|-----------------|-------------------|------------|------------------|
 | Reuse | vec_buffer(5N)  | null              | null       | null             |
 |-------|-----------------|-------------------|------------|------------------|
 Total Basic Memory Size = 239N * sizeof(float) + 10KB
 */
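/* Editorial note (not part of the original source): the 239N figure above can
 * be checked against the pointer layout set up in iou3D_detection below:
 * score(1) + valid_box(1) + temp_buffer(10) + intersect_pts_x/y(24 + 24) +
 * ordered_pts_x/y(24 + 24) + temp_long_1..3(3 * 24) + dist_ram(24) +
 * valid_pts(24) + nums_in_ram(1) + box1(5) + box2(5) = 239 floats per box,
 * with box1_buffer and nram_save accounted as the fixed ~10KB tail.
 */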
__nram__ char nram_buffer[MAX_NRAM_SIZE];
__mlu_shared__ char sram_buffer[SIZE_SRAM_BUF];
template <typename T>
__mlu_func__ void iou3D_detection(int32_t &result_box_num, int32_t *output_data,
const T *boxes_data, float *scores_data,
const int core_limit, const int input_box_num,
const float iou_threshold,
mluMemcpyDirection_t scores_load_dir,
mluMemcpyDirection_t scores_store_dir,
mluMemcpyDirection_t boxes_load_dir) {
  // NRAM is divided into (2+4*COMPUTE_COUNT_ALIGN) copies, counted in bytes
const int nram_save_limit_count = 256;
int box_read_limit_count = 256;
float div_thresh_iou = 1.0 / iou_threshold;
  // every box requires 239 * sizeof(float) bytes of space in NRAM;
const int32_t copies_of_nram = 239 * sizeof(float);
const int32_t limit = (MAX_NRAM_SIZE - 5 * box_read_limit_count * sizeof(T) -
nram_save_limit_count * sizeof(int32_t)) /
copies_of_nram;
// x,y,z,dx,dy,dz,angle
const T *input_x_ptr = boxes_data;
const T *input_y_ptr = input_x_ptr + input_box_num;
const T *input_dx_ptr = input_y_ptr + 2 * input_box_num;
const T *input_dy_ptr = input_dx_ptr + input_box_num;
const T *input_angle_ptr = input_dy_ptr + 2 * input_box_num;
float *input_score_ptr = scores_data;
// data split
int avg_cluster = 0;
int rem_cluster = 0;
int len_cluster = 0;
int cluster_offset = 0;
if (clusterDim > 0) {
// union
avg_cluster = input_box_num / clusterDim;
rem_cluster = input_box_num % clusterDim;
len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0);
cluster_offset = avg_cluster * clusterId +
(clusterId <= rem_cluster ? clusterId : rem_cluster);
} else {
// block
len_cluster = input_box_num;
cluster_offset = 0;
}
int len_core = input_box_num;
int input_offset = 0;
if (core_limit > 1) {
int avg_core = len_cluster / coreDim;
int rem_core = len_cluster % coreDim;
len_core = avg_core + (coreId < rem_core ? 1 : 0);
int core_offset =
avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
input_offset = cluster_offset + core_offset;
}
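  // Worked example of the split above (illustrative numbers, not from the
  // source): with input_box_num = 10 and clusterDim = 4, avg_cluster = 2 and
  // rem_cluster = 2, so clusters 0..3 own 3, 3, 2, 2 boxes starting at
  // offsets 0, 3, 6, 8; the same avg/rem scheme then divides each cluster's
  // share across its coreDim cores.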
int32_t max_seg_pad = IOU3D_DOWN(limit, IOU3D_SIZE);
int repeat_iou_compute = len_core / max_seg_pad;
int remain_iou_compute = len_core % max_seg_pad;
// basic consistent memory layout
void *score = ((char *)nram_buffer);
void *valid_box = ((char *)score) + 1 * max_seg_pad * sizeof(float);
void *temp_buffer = ((char *)valid_box) + 1 * max_seg_pad * sizeof(float);
void *intersect_pts_x =
((char *)temp_buffer) + 10 * max_seg_pad * sizeof(float);
void *intersect_pts_y =
((char *)intersect_pts_x) + 24 * max_seg_pad * sizeof(float);
void *ordered_pts_x =
((char *)intersect_pts_y) + 24 * max_seg_pad * sizeof(float);
void *ordered_pts_y =
((char *)ordered_pts_x) + 24 * max_seg_pad * sizeof(float);
void *temp_long_1 =
((char *)ordered_pts_y) + 24 * max_seg_pad * sizeof(float);
void *temp_long_2 = ((char *)temp_long_1) + 24 * max_seg_pad * sizeof(float);
void *temp_long_3 = ((char *)temp_long_2) + 24 * max_seg_pad * sizeof(float);
void *dist_ram = ((char *)temp_long_3) + 24 * max_seg_pad * sizeof(float);
void *valid_pts = ((char *)dist_ram) + 24 * max_seg_pad * sizeof(float);
void *nums_in_ram = ((char *)valid_pts) + 24 * max_seg_pad * sizeof(float);
T *box1 = (T *)(((char *)nums_in_ram) + 1 * max_seg_pad * sizeof(float));
T *box2 = (T *)(((char *)box1) + 5 * max_seg_pad * sizeof(float));
void *box1_buffer = ((char *)box2) + 5 * max_seg_pad * sizeof(float);
int32_t *nram_save =
(int32_t *)(((char *)box1_buffer) + 5 * box_read_limit_count * sizeof(T));
// nram_save ~ nram_save_limit_count * sizeof(int32_t)
int nram_save_count = 0;
// reuse memory
void *rotated_pts1_x = ((char *)dist_ram);
void *rotated_pts1_y =
((char *)rotated_pts1_x) + 4 * max_seg_pad * sizeof(float);
void *rotated_pts2_x =
((char *)rotated_pts1_y) + 4 * max_seg_pad * sizeof(float);
void *rotated_pts2_y =
((char *)rotated_pts2_x) + 4 * max_seg_pad * sizeof(float);
void *vec_buffer = ((char *)temp_long_1) + 5 * max_seg_pad * sizeof(float);
// vec_buffer ~ 16 * max_seg_pad * sizeof(float)
  // First, initialize the RAM with all 0; otherwise stale data could cause
  // unexpected nan/inf results
__bang_write_zero((unsigned char *)nram_buffer, copies_of_nram * max_seg_pad);
  // the shift by 8 and the 0xff mask rely on box_read_limit_count being
  // initialized to 256
const int max_box_seg_id = (input_box_num - 1) >> 8;
const int last_rem_box_number = ((input_box_num - 1) & 0xff) + 1;
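  // Example of the bit arithmetic above (illustrative): with
  // input_box_num = 1000, max_box_seg_id = 999 >> 8 = 3 and
  // last_rem_box_number = (999 & 0xff) + 1 = 232, i.e. three full 256-box
  // segments plus a final segment of 232 boxes.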
for (int32_t cur_box = 0; cur_box < input_box_num; ++cur_box) {
__sync_all();
int box_seg_id = cur_box >> 8, box_id = cur_box & 0xff;
box_read_limit_count = box_seg_id == max_box_seg_id ? last_rem_box_number
: box_read_limit_count;
if (box_id == 0) {
// x,y,z,dx,dy,dz,angle
int offset_num = box_seg_id << 8;
// x
__memcpy((char *)box1_buffer, input_x_ptr + offset_num,
box_read_limit_count * 1 * sizeof(T), boxes_load_dir,
box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
// y
__memcpy((char *)box1_buffer + box_read_limit_count * 1 * sizeof(T),
input_y_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
// dx
__memcpy((char *)box1_buffer + box_read_limit_count * 2 * sizeof(T),
input_dx_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
// dy
__memcpy((char *)box1_buffer + box_read_limit_count * 3 * sizeof(T),
input_dy_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
// angle
__memcpy((char *)box1_buffer + box_read_limit_count * 4 * sizeof(T),
input_angle_ptr + offset_num,
box_read_limit_count * 1 * sizeof(T), boxes_load_dir,
box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
}
if (((float *)input_score_ptr)[cur_box] == 0) {
continue;
}
// save result
nram_save[nram_save_count] = cur_box;
result_box_num++;
nram_save_count++;
if (clusterId == 0 && coreId == 0 &&
nram_save_count == nram_save_limit_count) {
pvLock();
__memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t),
NRAM2GDRAM);
pvUnlock();
output_data += nram_save_count;
nram_save_count = 0;
}
// prepare box1
// x
__bang_write_value((float *)box1, max_seg_pad,
float(((T *)box1_buffer)[box_id]));
// y
__bang_write_value(
(float *)box1 + max_seg_pad, max_seg_pad,
float(((T *)box1_buffer)[box_id + 1 * box_read_limit_count]));
// dx
__bang_write_value(
(float *)box1 + max_seg_pad * 2, max_seg_pad,
float(((T *)box1_buffer)[box_id + 2 * box_read_limit_count]));
// dy
__bang_write_value(
(float *)box1 + max_seg_pad * 3, max_seg_pad,
float(((T *)box1_buffer)[box_id + 3 * box_read_limit_count]));
// angle
__bang_write_value(
(float *)box1 + max_seg_pad * 4, max_seg_pad,
float(((T *)box1_buffer)[box_id + 4 * box_read_limit_count]));
float max_area = 1.0f *
((T *)box1_buffer)[box_id + 2 * box_read_limit_count] *
((T *)box1_buffer)[box_id + 3 * box_read_limit_count];
// update score
for (int i = 0; i <= repeat_iou_compute; i++) {
if (i == repeat_iou_compute && remain_iou_compute == 0) {
break;
}
int seg_len = max_seg_pad;
int cpy_len =
(i == repeat_iou_compute) ? remain_iou_compute : max_seg_pad;
// int half_offset = std::is_same<T, half>::value ? max_seg_pad * 5 : 0;
int half_offset = (sizeof(T) == sizeof(half)) ? max_seg_pad * 5 : 0;
// score
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(float), scores_load_dir,
cpy_len * sizeof(float), cpy_len * sizeof(float), 0);
// x
__memcpy(box2 + half_offset, input_x_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// y
__memcpy(box2 + half_offset + seg_len * 1,
input_y_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// dx
__memcpy(box2 + half_offset + seg_len * 2,
input_dx_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// dy
__memcpy(box2 + half_offset + seg_len * 3,
input_dy_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// angle
__memcpy(box2 + half_offset + seg_len * 4,
input_angle_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// if (std::is_same<T, half>::value) {
if (sizeof(T) == sizeof(half)) {
__bang_half2float((float *)box2, (half *)(box2 + half_offset),
seg_len * 5);
}
// Calculate rotated vertices
void *temp1_ram = ((char *)temp_buffer);
void *temp2_ram = ((char *)temp_buffer) + seg_len * sizeof(float);
void *temp3_ram = ((char *)temp_buffer) + 2 * seg_len * sizeof(float);
void *temp4_ram = ((char *)temp_buffer) + 3 * seg_len * sizeof(float);
getRotatedVertices((float *)rotated_pts1_x, (float *)rotated_pts1_y,
(float *)box1, (float *)temp1_ram, (float *)temp2_ram,
(float *)temp3_ram, (float *)temp4_ram, seg_len);
getRotatedVertices((float *)rotated_pts2_x, (float *)rotated_pts2_y,
(float *)box2, (float *)temp1_ram, (float *)temp2_ram,
(float *)temp3_ram, (float *)temp4_ram, seg_len);
__bang_write_zero((float *)valid_pts, 24 * seg_len);
__bang_write_zero((float *)nums_in_ram, seg_len);
__bang_write_value(((float *)valid_box), seg_len, 1.0f);
void *vec1_x = ((char *)vec_buffer);
void *vec1_y = ((char *)vec1_x) + 4 * seg_len * sizeof(float);
void *vec2_x = ((char *)vec1_y) + 4 * seg_len * sizeof(float);
void *vec2_y = ((char *)vec2_x) + 4 * seg_len * sizeof(float);
void *temp5_ram = ((char *)temp_buffer) + 4 * seg_len * sizeof(float);
void *temp6_ram = ((char *)temp_buffer) + 5 * seg_len * sizeof(float);
void *temp7_ram = ((char *)temp_buffer) + 6 * seg_len * sizeof(float);
void *temp8_ram = ((char *)temp_buffer) + 7 * seg_len * sizeof(float);
void *temp9_ram = ((char *)temp_buffer) + 8 * seg_len * sizeof(float);
void *temp10_ram = ((char *)temp_buffer) + 9 * seg_len * sizeof(float);
// Get all intersection points
getIntersectPts(
(float *)rotated_pts1_x, (float *)rotated_pts1_y,
(float *)rotated_pts2_x, (float *)rotated_pts2_y, (float *)vec1_x,
(float *)vec1_y, (float *)vec2_x, (float *)vec2_y,
(float *)intersect_pts_x, (float *)intersect_pts_y,
(float *)valid_pts, (float *)nums_in_ram, (float *)temp1_ram,
(float *)temp2_ram, (float *)temp3_ram, (float *)temp4_ram,
(float *)temp5_ram, (float *)temp6_ram, (float *)temp7_ram,
(float *)temp8_ram, (float *)temp9_ram, (float *)temp10_ram, seg_len);
// Where nums_in <= 2, set valid_box to false
__bang_write_value((float *)temp9_ram, COMPUTE_COUNT_ALIGN, (float)2);
__bang_cycle_gt((float *)temp1_ram, (float *)nums_in_ram,
(float *)temp9_ram, seg_len, COMPUTE_COUNT_ALIGN);
__bang_and((float *)valid_box, (float *)valid_box, (float *)temp1_ram,
seg_len);
__bang_cycle_and((float *)valid_pts, (float *)valid_pts,
(float *)valid_box, 24 * seg_len, seg_len);
// Convex-hull-graham to order the intersection points in clockwise order
// and find the contour area
convexHullGraham(
(float *)intersect_pts_x, (float *)intersect_pts_y,
(float *)ordered_pts_x, (float *)ordered_pts_y, (float *)dist_ram,
(float *)valid_box, (float *)valid_pts, (float *)nums_in_ram,
(float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram,
(float *)temp_long_1, (float *)temp_long_2, (float *)temp_long_3,
seg_len, seg_len);
// Calculate polygon area
// set temp1 = intersection part area
polygonArea((float *)ordered_pts_x, (float *)ordered_pts_y,
(float *)valid_box, (float *)valid_pts, (float *)nums_in_ram,
(float *)temp1_ram, (float *)temp2_ram, (float *)temp3_ram,
(float *)temp4_ram, (float *)temp5_ram, (float *)temp6_ram,
(float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram,
seg_len);
// area
__bang_mul((float *)temp2_ram, (float *)box2 + seg_len * 2,
(float *)box2 + seg_len * 3, seg_len);
// get the area_U: area + max_area - area_I
__bang_add_scalar((float *)temp2_ram, (float *)temp2_ram, float(max_area),
seg_len);
__bang_sub((float *)temp2_ram, (float *)temp2_ram, (float *)temp1_ram,
seg_len); // area_U
if (iou_threshold > 0.0) {
__bang_mul_scalar((float *)temp1_ram, (float *)temp1_ram,
div_thresh_iou, seg_len);
} else {
__bang_mul_scalar((float *)temp2_ram, (float *)temp2_ram, iou_threshold,
seg_len);
}
__bang_ge((float *)temp1_ram, (float *)temp2_ram, (float *)temp1_ram,
seg_len);
__bang_mul((float *)score, (float *)score, (float *)temp1_ram, seg_len);
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_pad, score,
cpy_len * sizeof(float), scores_store_dir,
cpy_len * sizeof(float), cpy_len * sizeof(float), 0);
pvUnlock();
}
}
if (clusterId == 0 && coreId == 0 && nram_save_count) {
pvLock();
__memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t),
NRAM2GDRAM);
pvUnlock();
}
}
__mlu_global__ void MLUBlockorUnionIKernelOU3D(
const void *input_boxes, const int input_box_num, const float iou_threshold,
const cnrtDataType_t data_type_input, void *workspace, void *result_num,
void *output) {
int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
mluMemcpyDirection_t scores_load_dir = GDRAM2NRAM;
mluMemcpyDirection_t scores_store_dir = NRAM2GDRAM;
mluMemcpyDirection_t boxes_load_dir = GDRAM2NRAM;
float *scores_data = (float *)workspace;
float *boxes_data = (float *)input_boxes;
const int cluster_score_size = input_box_num * sizeof(float);
const int cluster_boxes_size = input_box_num * 7 * input_dwidth;
char *sram_score = (char *)sram_buffer;
char *sram_boxes = (char *)sram_buffer + cluster_score_size;
if (clusterDim == 1 && SIZE_SRAM_BUF > cluster_score_size) {
scores_data = (float *)sram_score;
scores_load_dir = SRAM2NRAM;
scores_store_dir = NRAM2SRAM;
if (coreId == 0x80) {
__sramset((void *)sram_buffer, input_box_num, 1.0f);
}
} else {
if (coreId == 0) {
__gdramset(scores_data, input_box_num, 1.0f);
}
}
if (clusterDim == 1 &&
SIZE_SRAM_BUF - cluster_score_size >= cluster_boxes_size) {
boxes_load_dir = SRAM2NRAM;
boxes_data = (float *)sram_boxes;
if (coreId == 0x80) {
__memcpy((char *)boxes_data, (char *)input_boxes, cluster_boxes_size,
GDRAM2SRAM);
}
}
__sync_cluster();
int32_t result_box_num = 0;
int32_t *out_data = (int32_t *)output;
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
iou3D_detection(result_box_num, out_data, (half *)boxes_data, scores_data,
taskDim, input_box_num, iou_threshold, scores_load_dir,
scores_store_dir, boxes_load_dir);
}; break;
case CNRT_FLOAT32: {
iou3D_detection(result_box_num, out_data, boxes_data, scores_data,
taskDim, input_box_num, iou_threshold, scores_load_dir,
scores_store_dir, boxes_load_dir);
}; break;
}
((int32_t *)result_num)[0] = result_box_num;
}
void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_dram,
const int input_box_num, const float iou_threshold,
void *workspace, void *output_size, void *output) {
switch (k_type) {
default: { return; }
case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_UNION1:
case CNRT_FUNC_TYPE_UNION2:
case CNRT_FUNC_TYPE_UNION4:
case CNRT_FUNC_TYPE_UNION8:
case CNRT_FUNC_TYPE_UNION16: {
MLUBlockorUnionIKernelOU3D<<<k_dim, k_type, queue>>>(
(void *)boxes_dram, input_box_num, iou_threshold, data_type_input,
workspace, output_size, output);
}; break;
}
}
mmcv/ops/csrc/common/mlu/iou3d_utils.hpp (deleted, 100644 → 0)
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef IOU3D_UTILS_HPP_
#define IOU3D_UTILS_HPP_
#include "common_mlu_helper.hpp"
#define IOU3D_SIZE 64
#define IOU3D_UP(x, y) (x / y + (int)(x % y > 0)) * y
#define IOU3D_DOWN(x, y) (x / y) * y
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
#define COMPUTE_COUNT_ALIGN 64
#define INFO_NUM (5) // score, x1, y1, x2, y2
#define REDUCE_NUM \
(7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
#define SINGLE_BOX_DIM 5
#define MEMORY_CORE (0x80)
__mlu_func__ void pvLock() {
#if __BANG_ARCH__ == 270
  if (coreId != MEMORY_CORE) {
    __bang_lock(0, 0);
  }
#endif
}
__mlu_func__ void pvUnlock() {
#if __BANG_ARCH__ == 270
  if (coreId != MEMORY_CORE) {
    __bang_unlock(0, 0);
  }
#endif
}
// cross2d<T>(A, B) = A.x * B.y - A.y * B.x;
template <typename T>
inline __mlu_func__ void cross2d(T *result, const T *p1_x, const T *p1_y,
                                 const T *p2_x, const T *p2_y,
                                 const int &length, T *temp_ram) {
  __bang_mul((T *)temp_ram, (T *)p1_x, (T *)p2_y, length);
  __bang_mul((T *)result, (T *)p1_y, (T *)p2_x, length);
  __bang_sub((T *)result, (T *)temp_ram, (T *)result, length);
}
// dot2d<T>(A, B) = A.x * B.x + A.y * B.y
template <typename T>
inline __mlu_func__ void dot2d(T *result, const T *p1_x, const T *p1_y,
                               const T *p2_x, const T *p2_y, const int &length,
                               T *temp_ram) {
  __bang_mul((T *)temp_ram, (T *)p1_x, (T *)p2_x, length);
  __bang_mul((T *)result, (T *)p1_y, (T *)p2_y, length);
  __bang_add((T *)result, (T *)temp_ram, (T *)result, length);
}
template <typename T>
__mlu_func__ void getRotatedVertices(T *pts_x, T *pts_y, T *box, T *temp1,
                                     T *temp2, T *temp3, T *temp4,
                                     const uint32_t &actual_compute_box_num) {
  // T cosTheta2 = (T)cos(theta) * 0.5f; -- temp1
  // T sinTheta2 = (T)sin(theta) * 0.5f; -- temp2
  // theta is the box's 5th data: a, rotated radian;
#if __BANG_ARCH__ >= 300
  __bang_cos((float *)temp1, ((float *)box) + 4 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sin((float *)temp2, ((float *)box) + 4 * actual_compute_box_num,
             actual_compute_box_num);
#else
  __bang_taylor4_cos((T *)temp1, ((T *)box) + 4 * actual_compute_box_num,
                     (T *)temp3, (T *)temp4, actual_compute_box_num);
  __bang_taylor4_sin((T *)temp2, ((T *)box) + 4 * actual_compute_box_num,
                     (T *)temp3, (T *)temp4, actual_compute_box_num);
#endif
  __bang_mul_scalar((T *)temp1, (T *)temp1, (T)0.5, actual_compute_box_num);
  __bang_mul_scalar((T *)temp2, (T *)temp2, (T)0.5, actual_compute_box_num);
  // Temp3 = sinTheta2 * box.h;
  // Temp4 = cosTheta2 * box.w;
  __bang_mul((T *)temp3, (T *)temp2, ((T *)box) + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_mul((T *)temp4, (T *)temp1, ((T *)box) + 2 * actual_compute_box_num,
             actual_compute_box_num);
  // pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
  // pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
  __bang_sub((T *)pts_x, (T *)box, (T *)temp3, actual_compute_box_num);
  __bang_sub((T *)pts_x, (T *)pts_x, (T *)temp4, actual_compute_box_num);
  __bang_add((T *)pts_x + 1 * actual_compute_box_num, (T *)box, (T *)temp3,
             actual_compute_box_num);
  __bang_sub((T *)pts_x + 1 * actual_compute_box_num,
             (T *)pts_x + 1 * actual_compute_box_num, (T *)temp4,
             actual_compute_box_num);
  // Temp3 = cosTheta2 * box.h;
  // Temp4 = sinTheta2 * box.w;
  __bang_mul((T *)temp3, (T *)temp1, box + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_mul((T *)temp4, (T *)temp2, box + 2 * actual_compute_box_num,
             actual_compute_box_num);
  // pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
  // pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
  __bang_add((T *)pts_y, (T *)box + 1 * actual_compute_box_num, (T *)temp3,
             actual_compute_box_num);
  __bang_sub((T *)pts_y, (T *)pts_y, (T *)temp4, actual_compute_box_num);
  __bang_sub((T *)pts_y + 1 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num, (T *)temp3,
             actual_compute_box_num);
  __bang_sub((T *)pts_y + 1 * actual_compute_box_num,
             (T *)pts_y + 1 * actual_compute_box_num, (T *)temp4,
             actual_compute_box_num);
  // pts[2].x = 2 * box.x_ctr - pts[0].x;
  // pts[3].x = 2 * box.x_ctr - pts[1].x;
  __bang_add((T *)pts_x + 2 * actual_compute_box_num, (T *)box, (T *)box,
             actual_compute_box_num);
  __bang_sub((T *)pts_x + 2 * actual_compute_box_num,
             (T *)pts_x + 2 * actual_compute_box_num, (T *)pts_x,
             actual_compute_box_num);
  __bang_add((T *)pts_x + 3 * actual_compute_box_num, (T *)box, (T *)box,
             actual_compute_box_num);
  __bang_sub((T *)pts_x + 3 * actual_compute_box_num,
             (T *)pts_x + 3 * actual_compute_box_num,
             (T *)pts_x + 1 * actual_compute_box_num, actual_compute_box_num);
  // pts[2].y = 2 * box.y_ctr - pts[0].y;
  // pts[3].y = 2 * box.y_ctr - pts[1].y;
  __bang_add((T *)pts_y + 2 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num, actual_compute_box_num);
  __bang_sub((T *)pts_y + 2 * actual_compute_box_num,
             (T *)pts_y + 2 * actual_compute_box_num, (T *)pts_y,
             actual_compute_box_num);
  __bang_add((T *)pts_y + 3 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num, actual_compute_box_num);
  __bang_sub((T *)pts_y + 3 * actual_compute_box_num,
             (T *)pts_y + 3 * actual_compute_box_num,
             (T *)pts_y + 1 * actual_compute_box_num, actual_compute_box_num);
}
template <typename T>
__mlu_func__ void getIntersectPts(T *rotated_pts1_x, T *rotated_pts1_y,
                                  T *rotated_pts2_x, T *rotated_pts2_y,
                                  T *vec1_x, T *vec1_y, T *vec2_x, T *vec2_y,
                                  T *intersect_pts_x, T *intersect_pts_y,
                                  T *valid_pts, T *nums_in_ram, T *temp1_ram,
                                  T *temp2_ram, T *temp3_ram, T *temp4_ram,
                                  T *temp5_ram, T *temp6_ram, T *temp7_ram,
                                  T *temp8_ram, T *temp9_ram, T *temp10_ram,
                                  const uint32_t &actual_compute_box_num) {
  // Initialize const data to ram
  // temp3 = const 1e-14(@float), length = COMPUTE_COUNT_ALIGN
#if __BANG_ARCH__ >= 300
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, (T)1e-14);
#else
  // NOTE: Since active_reciphp function has strict value range,
  // [2.2205e-16, 2e6]@float, [0.00391, 65504]@half
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, (float)1e-14);
#endif
  // temp4 = const T(0), length = COMPUTE_COUNT_ALIGN
  __bang_write_value((T *)temp4_ram, COMPUTE_COUNT_ALIGN, (T)0);
  // temp5 = const T(1), length = COMPUTE_COUNT_ALIGN
  __bang_write_value((T *)temp5_ram, COMPUTE_COUNT_ALIGN, (T)1);
  // Line vector, from p1 to p2 is: p1+(p2-p1)*t, t=[0,1]
  // for i = 0~3, vec[i] = pts[(i+1)%4] - pts[i]
  __bang_sub((T *)vec1_x, (T *)rotated_pts1_x + actual_compute_box_num,
             (T *)rotated_pts1_x, 3 * actual_compute_box_num);
  __bang_sub((T *)vec1_x + 3 * actual_compute_box_num, (T *)rotated_pts1_x,
             (T *)rotated_pts1_x + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sub((T *)vec1_y, (T *)rotated_pts1_y + actual_compute_box_num,
             (T *)rotated_pts1_y, 3 * actual_compute_box_num);
  __bang_sub((T *)vec1_y + 3 * actual_compute_box_num, (T *)rotated_pts1_y,
             (T *)rotated_pts1_y + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sub((T *)vec2_x, (T *)rotated_pts2_x + actual_compute_box_num,
             (T *)rotated_pts2_x, 3 * actual_compute_box_num);
  __bang_sub((T *)vec2_x + 3 * actual_compute_box_num, (T *)rotated_pts2_x,
             (T *)rotated_pts2_x + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sub((T *)vec2_y, (T *)rotated_pts2_y + actual_compute_box_num,
             (T *)rotated_pts2_y, 3 * actual_compute_box_num);
  __bang_sub((T *)vec2_y + 3 * actual_compute_box_num, (T *)rotated_pts2_y,
             (T *)rotated_pts2_y + 3 * actual_compute_box_num,
             actual_compute_box_num);
  // First, line test - test all line combos for intersection, 4x4 possible
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      // T det = cross2d<T>(vec2[j], vec1[i]) -- temp2
      cross2d<T>((T *)temp2_ram, (T *)vec2_x + j * actual_compute_box_num,
                 (T *)vec2_y + j * actual_compute_box_num,
                 (T *)vec1_x + i * actual_compute_box_num,
                 (T *)vec1_y + i * actual_compute_box_num,
                 actual_compute_box_num, (T *)temp1_ram);
      // temp8 = sign(det), since active_reciphp only receive positive values
      __bang_active_sign((T *)temp8_ram, (T *)temp2_ram,
                         actual_compute_box_num);
      // deal with parallel lines, temp2 = fabs(det), temp1 = temp2 > 1e-14
      __bang_active_abs((T *)temp2_ram, (T *)temp2_ram,
                        actual_compute_box_num);
      __bang_cycle_gt((T *)temp1_ram, (T *)temp2_ram, (T *)temp3_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      // Where temp1 = false, set recip input to 1, avoiding recip(0), cause inf
      __bang_not((T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);
      __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp1_ram,
                 actual_compute_box_num);
      __bang_add((T *)temp2_ram, (T *)temp2_ram, (T *)temp9_ram,
                 actual_compute_box_num);
      // temp2 = 1/temp2, use mult (1/temp2) instead of div temp2
#if __BANG_ARCH__ >= 300
      __bang_recip((float *)temp2_ram, (float *)temp2_ram,
                   actual_compute_box_num);
#else
      // NOTE: active_reciphp function has strict value range:
      // [2.2205e-16, 2e6]@float, [0.00391, 65504]@half
      __bang_active_reciphp((T *)temp2_ram, (T *)temp2_ram,
                            actual_compute_box_num);
#endif
      // Restore temp2 invalid box value 1 and sign-bit
      __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp1_ram,
                 actual_compute_box_num);
      __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp8_ram,
                 actual_compute_box_num);
      // auto vec12 = pts2[j] - pts1[i], (temp6, temp7) = (x, y)
      __bang_sub((T *)temp6_ram,
                 (T *)rotated_pts2_x + j * actual_compute_box_num,
                 (T *)rotated_pts1_x + i * actual_compute_box_num,
                 actual_compute_box_num);
      __bang_sub((T *)temp7_ram,
                 (T *)rotated_pts2_y + j * actual_compute_box_num,
                 (T *)rotated_pts1_y + i * actual_compute_box_num,
                 actual_compute_box_num);
      // T t1 = cross2d<T>(vec2[j], vec12) mult (1/det) -- temp8
      cross2d<T>((T *)temp8_ram, (T *)vec2_x + j * actual_compute_box_num,
                 (T *)vec2_y + j * actual_compute_box_num, (T *)temp6_ram,
                 (T *)temp7_ram, actual_compute_box_num, (T *)temp9_ram);
      __bang_mul((T *)temp8_ram, (T *)temp8_ram, (T *)temp2_ram,
                 actual_compute_box_num);
      // temp1 &= (t1 >= 0.0f && t1 <= 1.0f) -- temp9
      __bang_cycle_ge((T *)temp9_ram, (T *)temp8_ram, (T *)temp4_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp9_ram,
                 actual_compute_box_num);
      __bang_cycle_le((T *)temp9_ram, (T *)temp8_ram, (T *)temp5_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp9_ram,
                 actual_compute_box_num);
      // T t2 = cross2d<T>(vec1[i], vec12) mult temp2 -- temp9
      // NOTE: temp8(t1) is used after, reuse temp7(p2_y) as cross2d temp ram
      cross2d<T>((T *)temp9_ram, (T *)vec1_x + i * actual_compute_box_num,
                 (T *)vec1_y + i * actual_compute_box_num, (T *)temp6_ram,
                 (T *)temp7_ram, actual_compute_box_num, (T *)temp7_ram);
      __bang_mul((T *)temp9_ram, (T *)temp9_ram, (T *)temp2_ram,
                 actual_compute_box_num);
      // temp1 &= (t2 >= 0.0f && t2 <= 1.0f) -- temp9
      __bang_cycle_ge((T *)temp7_ram, (T *)temp9_ram, (T *)temp4_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp7_ram,
                 actual_compute_box_num);
      __bang_cycle_le((T *)temp7_ram, (T *)temp9_ram, (T *)temp5_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp7_ram,
                 actual_compute_box_num);
      // intersections = (pts1[i] + vec1[i] * t1) * temp1
      __bang_mul((T *)temp9_ram, (T *)vec1_x + i * actual_compute_box_num,
                 (T *)temp8_ram, actual_compute_box_num);
      __bang_add((T *)temp9_ram,
                 (T *)rotated_pts1_x + i * actual_compute_box_num,
                 (T *)temp9_ram, actual_compute_box_num);
      __bang_mul((T *)intersect_pts_x + (4 * i + j) * actual_compute_box_num,
                 (T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);
      __bang_mul((T *)temp9_ram, (T *)vec1_y + i * actual_compute_box_num,
                 (T *)temp8_ram, actual_compute_box_num);
      __bang_add((T *)temp9_ram,
                 (T *)rotated_pts1_y + i * actual_compute_box_num,
                 (T *)temp9_ram, actual_compute_box_num);
      __bang_mul((T *)intersect_pts_y + (4 * i + j) * actual_compute_box_num,
                 (T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);
      // Assign `valid_pts` bit and accumulate `nums_in` of valid points of
      // each box pair
      __bang_or((T *)valid_pts + (4 * i + j) * actual_compute_box_num,
                (T *)valid_pts + (4 * i + j) * actual_compute_box_num,
                (T *)temp1_ram, actual_compute_box_num);
      __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
                 actual_compute_box_num);
    }
  }
  // Check for vertices of rect1 inside rect2
  // temp5 = ABdotAB
  dot2d<T>((T *)temp5_ram, (T *)vec2_x, (T *)vec2_y, (T *)vec2_x, (T *)vec2_y,
           actual_compute_box_num, (T *)temp9_ram);
  // temp6 = ADdotAD
  dot2d<T>((T *)temp6_ram, (T *)vec2_x + 3 * actual_compute_box_num,
           (T *)vec2_y + 3 * actual_compute_box_num,
           (T *)vec2_x + 3 * actual_compute_box_num,
           (T *)vec2_y + 3 * actual_compute_box_num, actual_compute_box_num,
           (T *)temp9_ram);
  // assume ABCD is the rectangle, and P is the point to be judged
  // P is inside ABCD iff. P's projection on AB lines within AB
  // and P's projection on AD lies within AD
  for (int i = 0; i < 4; i++) {
    // AP = pts1[i] - pts2[0] = (temp7, temp8)
    __bang_sub((T *)temp7_ram,
               (T *)rotated_pts1_x + i * actual_compute_box_num,
               (T *)rotated_pts2_x, actual_compute_box_num);
    __bang_sub((T *)temp8_ram,
               (T *)rotated_pts1_y + i * actual_compute_box_num,
               (T *)rotated_pts2_y, actual_compute_box_num);
    // temp9 = APdotAB = dot2d<T>(AP, AB)
    dot2d<T>((T *)temp9_ram, (T *)temp7_ram, (T *)temp8_ram, (T *)vec2_x,
             (T *)vec2_y, actual_compute_box_num, (T *)temp2_ram);
    // temp10 = APdotAD = -dot2d<T>(AP, DA)
    dot2d<T>((T *)temp10_ram, (T *)temp7_ram, (T *)temp8_ram,
             (T *)vec2_x + 3 * actual_compute_box_num,
             (T *)vec2_y + 3 * actual_compute_box_num, actual_compute_box_num,
             (T *)temp2_ram);
    __bang_mul_scalar((T *)temp10_ram, (T *)temp10_ram, (T)-1,
                      actual_compute_box_num);
    // ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
    //  (APdotAD <= ADdotAD))
    __bang_cycle_ge((T *)temp1_ram, (T *)temp9_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_cycle_ge((T *)temp2_ram, (T *)temp10_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp9_ram, (T *)temp5_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp10_ram, (T *)temp6_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    // 16 means the 4x4 possible intersection points above
    __bang_mul((T *)intersect_pts_x + (16 + i) * actual_compute_box_num,
               (T *)temp1_ram,
               (T *)rotated_pts1_x + i * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)intersect_pts_y + (16 + i) * actual_compute_box_num,
               (T *)temp1_ram,
               (T *)rotated_pts1_y + i * actual_compute_box_num,
               actual_compute_box_num);
    // assign valid_pts bit and accumulate nums of valid points of each box pair
    __bang_or((T *)valid_pts + (16 + i) * actual_compute_box_num,
              (T *)valid_pts + (16 + i) * actual_compute_box_num,
              (T *)temp1_ram, actual_compute_box_num);
    __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
               actual_compute_box_num);
  }
  // Reverse the check - check for vertices of rect2 inside rect1
  // temp5 = ABdotAB
  dot2d<T>((T *)temp5_ram, (T *)vec1_x, (T *)vec1_y, (T *)vec1_x, (T *)vec1_y,
           actual_compute_box_num, (T *)temp9_ram);
  // temp6 = ADdotAD
  dot2d<T>((T *)temp6_ram, (T *)vec1_x + 3 * actual_compute_box_num,
           (T *)vec1_y + 3 * actual_compute_box_num,
           (T *)vec1_x + 3 * actual_compute_box_num,
           (T *)vec1_y + 3 * actual_compute_box_num, actual_compute_box_num,
           (T *)temp9_ram);
  for (int i = 0; i < 4; i++) {
    // AP = pts2[i] - pts1[0] = (temp7, temp8)
    __bang_sub((T *)temp7_ram,
               (T *)rotated_pts2_x + i * actual_compute_box_num,
               (T *)rotated_pts1_x, actual_compute_box_num);
    __bang_sub((T *)temp8_ram,
               (T *)rotated_pts2_y + i * actual_compute_box_num,
               (T *)rotated_pts1_y, actual_compute_box_num);
    // temp9 = APdotAB = dot2d<T>(AP, AB)
    dot2d<T>((T *)temp9_ram, (T *)temp7_ram, (T *)temp8_ram, (T *)vec1_x,
             (T *)vec1_y, actual_compute_box_num, (T *)temp2_ram);
    // temp10 = APdotAD = -dot2d<T>(AP, DA)
    dot2d<T>((T *)temp10_ram, (T *)temp7_ram, (T *)temp8_ram,
             (T *)vec1_x + 3 * actual_compute_box_num,
             (T *)vec1_y + 3 * actual_compute_box_num, actual_compute_box_num,
             (T *)temp2_ram);
    __bang_mul_scalar((T *)temp10_ram, (T *)temp10_ram, (T)-1,
                      actual_compute_box_num);
    // ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
    //  (APdotAD <= ADdotAD))
    __bang_cycle_ge((T *)temp1_ram, (T *)temp9_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_cycle_ge((T *)temp2_ram, (T *)temp10_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp9_ram, (T *)temp5_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp10_ram, (T *)temp6_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    // 20 means the (4x4+4) possible intersection points above
    __bang_mul((T *)intersect_pts_x + (20 + i) * actual_compute_box_num,
               (T *)temp1_ram,
               (T *)rotated_pts2_x + i * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)intersect_pts_y + (20 + i) * actual_compute_box_num,
               (T *)temp1_ram,
               (T *)rotated_pts2_y + i * actual_compute_box_num,
               actual_compute_box_num);
    // assign valid_pts bit and accumulate nums of valid points of each box pair
    __bang_or((T *)valid_pts + (20 + i) * actual_compute_box_num,
              (T *)valid_pts + (20 + i) * actual_compute_box_num,
              (T *)temp1_ram, actual_compute_box_num);
    __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
               actual_compute_box_num);
  }
}
template <typename T>
__mlu_func__ void convexHullGraham(
    T *intersect_pts_x, T *intersect_pts_y, T *ordered_pts_x,
    T *ordered_pts_y, T *dist_ram, T *valid_box, T *valid_pts,
    T *nums_in_ram, T *temp1_ram, T *temp2_ram, T *temp3_ram,
    T *temp_long_1, T *temp_long_2, T *temp_long_3,
    const uint32_t &actual_box_num, const uint32_t &actual_compute_box_num) {
  // Step1. Find the point with minimum y, if more than 1 points have the same
  // minimum y, pick the one with the minimum x.
  // set p[i].y to max_y_value if not valid_pts, to avoid invalid result
  // 24 means all possible intersection points
  __bang_max((T *)temp2_ram, (T *)intersect_pts_y,
             24 * actual_compute_box_num);
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN,
                     ((T *)temp2_ram)[0]);
  __bang_not((T *)temp_long_1, (T *)valid_pts, 24 * actual_compute_box_num);
  __bang_cycle_mul((T *)temp_long_1, (T *)temp_long_1, (T *)temp3_ram,
                   24 * actual_compute_box_num, COMPUTE_COUNT_ALIGN);
  __bang_mul((T *)temp_long_2, (T *)intersect_pts_y, (T *)valid_pts,
             24 * actual_compute_box_num);
  __bang_add((T *)temp_long_2, (T *)temp_long_2, (T *)temp_long_1,
             24 * actual_compute_box_num);
  // temp2 = min_y_value(temp_long_2), use min_pool, channel=box_num, h=1, w=24
  __bang_minpool((T *)temp2_ram, (T *)temp_long_2, actual_compute_box_num, 1,
                 24, 1, 24, 1, 24);
  __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)valid_box,
             actual_compute_box_num);
  // set p[i].x to max_x_value if not min_y point
  __bang_max((T *)temp1_ram, (T *)intersect_pts_x,
             24 * actual_compute_box_num);
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN,
                     ((T *)temp1_ram)[0]);
  __bang_cycle_eq((T *)temp_long_1, (T *)temp_long_2, (T *)temp2_ram,
                  24 * actual_compute_box_num, actual_compute_box_num);
  __bang_and((T *)temp_long_1, (T *)temp_long_1, (T *)valid_pts,
             24 * actual_compute_box_num);
  __bang_not((T *)temp_long_3, (T *)temp_long_1, 24 * actual_compute_box_num);
  __bang_cycle_mul((T *)temp_long_3, (T *)temp_long_3, (T *)temp3_ram,
                   24 * actual_compute_box_num, COMPUTE_COUNT_ALIGN);
  __bang_mul((T *)temp_long_1, (T *)intersect_pts_x, (T *)temp_long_1,
             24 * actual_compute_box_num);
  __bang_add((T *)temp_long_1, (T *)temp_long_1, (T *)temp_long_3,
             24 * actual_compute_box_num);
  // temp3 = min_x_value(temp_long_1), use min_pool, channel=box_num, h=1, w=24
  __bang_minpool((T *)temp3_ram, (T *)temp_long_1, actual_compute_box_num, 1,
                 24, 1, 24, 1, 24);
  __bang_mul((T *)temp3_ram, (T *)temp3_ram, (T *)valid_box,
             actual_compute_box_num);
  // Step2. All points subtract starting-point (for sorting in the next step)
  __bang_cycle_sub((T *)ordered_pts_x, (T *)intersect_pts_x, (T *)temp3_ram,
                   24 * actual_compute_box_num, actual_compute_box_num);
  __bang_cycle_sub((T *)ordered_pts_y, (T *)intersect_pts_y, (T *)temp2_ram,
                   24 * actual_compute_box_num, actual_compute_box_num);
  __bang_mul((T *)ordered_pts_x, (T *)ordered_pts_x, (T *)valid_pts,
             24 * actual_compute_box_num);
  __bang_mul((T *)ordered_pts_y, (T *)ordered_pts_y, (T *)valid_pts,
             24 * actual_compute_box_num);
  // Step3. Sort every intersection point according to their relative
  // cross-product values (essentially sorting according to angles)
  // If the angles are the same, sort according to distance to origin
  dot2d<T>((T *)dist_ram, (T *)ordered_pts_x, (T *)ordered_pts_y,
           (T *)ordered_pts_x, (T *)ordered_pts_y,
           24 * actual_compute_box_num, (T *)temp_long_3);
  T temp, temp_nums_in, temp_dist_1, temp_dist_2;
  T temp1_x, temp1_y;
  T temp2_x, temp2_y;
  for (int i = 0; i < actual_box_num; i++) {
    if (((T *)valid_box)[i]) {
      // make sure all nums_in[i] points are at the front
      for (int ii = 0; ii < 23; ii++) {
        for (int jj = ii + 1; jj < 24; jj++) {
          int ii_index = ii * actual_compute_box_num + i;
          int jj_index = jj * actual_compute_box_num + i;
          // ii point is not valid and jj point is valid, swap jj for ii
          if ((!((T *)valid_pts)[ii_index]) && ((T *)valid_pts)[jj_index]) {
            ((T *)ordered_pts_x)[ii_index] = ((T *)ordered_pts_x)[jj_index];
            ((T *)ordered_pts_y)[ii_index] = ((T *)ordered_pts_y)[jj_index];
            ((T *)dist_ram)[ii_index] = ((T *)dist_ram)[jj_index];
            ((T *)valid_pts)[ii_index] = true;
            ((T *)ordered_pts_x)[jj_index] = 0;
            ((T *)ordered_pts_y)[jj_index] = 0;
            ((T *)dist_ram)[jj_index] = 0;
            ((T *)valid_pts)[jj_index] = false;
            break;
          }
        }
      }
      temp_nums_in = ((T *)nums_in_ram)[i];
      // make original q[0] = min_x, min_y before sort
      for (int ii = 1; ii < temp_nums_in; ii++) {
        int ii_index = ii * actual_compute_box_num + i;
        if (((T *)dist_ram)[ii_index] == 0) {
          // swap q[ii_index] and q[0]
          ((T *)ordered_pts_x)[ii_index] = ((T *)ordered_pts_x)[i];
          ((T *)ordered_pts_y)[ii_index] = ((T *)ordered_pts_y)[i];
          ((T *)dist_ram)[ii_index] = ((T *)dist_ram)[i];
          ((T *)ordered_pts_x)[i] = 0;
          ((T *)ordered_pts_y)[i] = 0;
          ((T *)dist_ram)[i] = 0;
          break;
        }
      }
      for (int ii = 1; ii < temp_nums_in - 1; ii++) {
        for (int jj = ii + 1; jj < temp_nums_in; jj++) {
          int ii_index = ii * actual_compute_box_num + i;
          int jj_index = jj * actual_compute_box_num + i;
          temp1_x = ((T *)ordered_pts_x)[ii_index];
          temp1_y = ((T *)ordered_pts_y)[ii_index];
          temp2_x = ((T *)ordered_pts_x)[jj_index];
          temp2_y = ((T *)ordered_pts_y)[jj_index];
          // calculate cross product and sort q (ordered_pts)
          temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
          temp_dist_1 = ((T *)dist_ram)[ii_index];
          temp_dist_2 = ((T *)dist_ram)[jj_index];
          if ((temp < (T)-1e-6) ||
              ((fabs(temp) < (T)1e-6) && (temp_dist_1 > temp_dist_2))) {
            ((T *)ordered_pts_x)[ii_index] = temp2_x;
            ((T *)ordered_pts_y)[ii_index] = temp2_y;
            ((T *)ordered_pts_x)[jj_index] = temp1_x;
            ((T *)ordered_pts_y)[jj_index] = temp1_y;
            ((T *)dist_ram)[ii_index] = temp_dist_2;
            ((T *)dist_ram)[jj_index] = temp_dist_1;
          }
        }
      }
      // Step4:
      // Make sure there are at least 2 points(that don't overlap with each
      // other) in the stack
      int k;  // index of the non-overlapped second point
      for (k = 1; k < temp_nums_in; k++) {
        if (((T *)dist_ram)[k * actual_compute_box_num + i] > (T)1e-8) {
          break;
        }
      }
      if (k == temp_nums_in) {
        // We reach the end, which means the convex hull is just one point
        // set valid_box = 0, to get ious = 0
        ((T *)valid_box)[i] = 0;
        continue;
      }
      // q[1] = q[k];
      ((T *)ordered_pts_x)[actual_compute_box_num + i] =
          ((T *)ordered_pts_x)[k * actual_compute_box_num + i];
      ((T *)ordered_pts_y)[actual_compute_box_num + i] =
          ((T *)ordered_pts_y)[k * actual_compute_box_num + i];
      // Step 5:
      // Finally we can start the scanning process.
      // When a non-convex relationship between the 3 points is found
      // (either concave shape or duplicated points),
      // we pop the previous point from the stack
      // until the 3-point relationship is convex again, or
      // until the stack only contains two points
      int m = 2;  // 2 points in the stack
      for (int j = k + 1; j < temp_nums_in; j++) {
        // while (m > 1 && cross2d<T>(q[j] - q[m - 2], q[m - 1] - q[m - 2]) >=
        // 0) {
        //   m--;
        // }
        temp1_x = ((T *)ordered_pts_x)[j * actual_compute_box_num + i] -
                  ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
        temp1_y = ((T *)ordered_pts_y)[j * actual_compute_box_num + i] -
                  ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
        temp2_x = ((T *)ordered_pts_x)[(m - 1) * actual_compute_box_num + i] -
                  ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
        temp2_y = ((T *)ordered_pts_y)[(m - 1) * actual_compute_box_num + i] -
                  ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
        temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
        while ((m > 1) && (temp >= 0)) {
          m--;
          if (m > 1) {
            temp1_x =
                ((T *)ordered_pts_x)[j * actual_compute_box_num + i] -
                ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
            temp1_y =
                ((T *)ordered_pts_y)[j * actual_compute_box_num + i] -
                ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
            temp2_x =
                ((T *)ordered_pts_x)[(m - 1) * actual_compute_box_num + i] -
                ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
            temp2_y =
                ((T *)ordered_pts_y)[(m - 1) * actual_compute_box_num + i] -
                ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
            temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
          }
        }
        // q[m++] = q[j];
        ((T *)ordered_pts_x)[m * actual_compute_box_num + i] =
            ((T *)ordered_pts_x)[j * actual_compute_box_num + i];
        ((T *)ordered_pts_y)[m * actual_compute_box_num + i] =
            ((T *)ordered_pts_y)[j * actual_compute_box_num + i];
        m++;
      }
      // set last(24-m) valid_pts to false, to erase invalid q in polygon area
      for (int j = m; j < temp_nums_in; j++) {
        ((T *)valid_pts)[j * actual_compute_box_num + i] = 0;
      }
      ((T *)nums_in_ram)[i] = m;
    }
  }
}
template <typename T>
__mlu_func__ void polygonArea(T *ordered_pts_x, T *ordered_pts_y,
                              T *valid_box, T *valid_pts, T *nums_in_ram,
                              T *temp1_ram, T *temp2_ram, T *temp3_ram,
                              T *temp4_ram, T *temp5_ram, T *temp6_ram,
                              T *temp7_ram, T *temp8_ram, T *temp9_ram,
                              const uint32_t &actual_compute_box_num) {
  // Set where nums_in <= 2, valid_box = false
  __bang_write_value((T *)temp9_ram, COMPUTE_COUNT_ALIGN, (T)2);
  __bang_cycle_gt((T *)temp1_ram, (T *)nums_in_ram, (T *)temp9_ram,
                  actual_compute_box_num, COMPUTE_COUNT_ALIGN);
  __bang_and((T *)valid_box, (T *)valid_box, (T *)temp1_ram,
             actual_compute_box_num);
  // temp1 = area, initialize with all 0
  __bang_write_zero((T *)temp1_ram, actual_compute_box_num);
  __bang_max((T *)temp7_ram, (T *)nums_in_ram, actual_compute_box_num);
  // temp_nums_in = max(nums_in)
  T temp_nums_in = ((T *)temp7_ram)[0];
  for (int i = 1; i < temp_nums_in - 1; i++) {
    // q[i] - q[0]: (temp6, temp7)
    __bang_sub((T *)temp6_ram,
               (T *)ordered_pts_x + i * actual_compute_box_num,
               (T *)ordered_pts_x, actual_compute_box_num);
    __bang_sub((T *)temp7_ram,
               (T *)ordered_pts_y + i * actual_compute_box_num,
               (T *)ordered_pts_y, actual_compute_box_num);
    __bang_mul((T *)temp6_ram, (T *)temp6_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)temp7_ram, (T *)temp7_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    // q[i + 1] - q[0]: (temp8, temp9)
    __bang_sub((T *)temp8_ram,
               (T *)ordered_pts_x + (i + 1) * actual_compute_box_num,
               (T *)ordered_pts_x, actual_compute_box_num);
    __bang_sub((T *)temp9_ram,
               (T *)ordered_pts_y + (i + 1) * actual_compute_box_num,
               (T *)ordered_pts_y, actual_compute_box_num);
    __bang_mul((T *)temp8_ram, (T *)temp8_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)temp9_ram, (T *)temp9_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    // area += fabs(cross2d<T>(q[i] - q[0], q[i + 1] - q[0]));
    __bang_mul((T *)temp4_ram, (T *)temp6_ram, (T *)temp9_ram,
               actual_compute_box_num);
    __bang_mul((T *)temp5_ram, (T *)temp7_ram, (T *)temp8_ram,
               actual_compute_box_num);
    __bang_sub((T *)temp3_ram, (T *)temp4_ram, (T *)temp5_ram,
               actual_compute_box_num);
    __bang_active_abs((T *)temp3_ram, (T *)temp3_ram,
                      actual_compute_box_num);
    __bang_add((T *)temp1_ram, (T *)temp1_ram, (T *)temp3_ram,
               actual_compute_box_num);
  }
  // Set where valid_box = false, intersection = 0
  __bang_mul((T *)temp1_ram, (T *)temp1_ram, (T *)valid_box,
             actual_compute_box_num);
  // area = area / 2.0
  __bang_mul_scalar((T *)temp1_ram, (T *)temp1_ram, (T)0.5,
                    actual_compute_box_num);
}
#endif // IOU3D_UTILS_HPP_
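For orientation, the vectorized convexHullGraham/polygonArea pair above implements, per box pair, the scalar algorithm sketched below. This is an editorial reference in plain C++ (the Point struct and function names are illustrative, not part of MMCV), with the duplicate-point bookkeeping of the original simplified:

#include <algorithm>
#include <cmath>
#include <vector>

struct Point {
  float x, y;
};

// Same formula as the vectorized cross2d above.
static float cross2d(const Point &a, const Point &b) {
  return a.x * b.y - a.y * b.x;
}

// Scalar analogue of convexHullGraham + polygonArea for a single box pair:
// order the intersection points around the bottom-most point, scan out
// non-convex turns, then accumulate the fan-triangulation area.
float convexHullArea(std::vector<Point> pts) {
  if (pts.size() <= 2) return 0.0f;  // mirrors "nums_in <= 2 => valid_box = 0"
  // Step 1: start from the point with minimum y (ties broken by minimum x).
  std::swap(pts[0],
            *std::min_element(pts.begin(), pts.end(),
                              [](const Point &a, const Point &b) {
                                return a.y < b.y || (a.y == b.y && a.x < b.x);
                              }));
  // Step 2: make every point relative to the starting point.
  for (size_t i = 1; i < pts.size(); ++i) {
    pts[i].x -= pts[0].x;
    pts[i].y -= pts[0].y;
  }
  pts[0] = {0.0f, 0.0f};
  // Step 3: sort by angle via cross product; ties by distance to the origin.
  std::sort(pts.begin() + 1, pts.end(), [](const Point &a, const Point &b) {
    float c = cross2d(a, b);
    if (std::fabs(c) > 1e-6f) return c > 0.0f;
    return a.x * a.x + a.y * a.y < b.x * b.x + b.y * b.y;
  });
  // Steps 4-5: Graham scan; pop while the three-point turn is not convex.
  std::vector<Point> q = {pts[0], pts[1]};
  for (size_t j = 2; j < pts.size(); ++j) {
    while (q.size() > 1) {
      const Point &o = q[q.size() - 2];
      Point v1 = {pts[j].x - o.x, pts[j].y - o.y};
      Point v2 = {q.back().x - o.x, q.back().y - o.y};
      if (cross2d(v1, v2) >= 0.0f) {
        q.pop_back();
      } else {
        break;
      }
    }
    q.push_back(pts[j]);
  }
  // Fan triangulation from q[0], as in polygonArea.
  float area = 0.0f;
  for (size_t i = 1; i + 1 < q.size(); ++i) {
    Point a = {q[i].x - q[0].x, q[i].y - q[0].y};
    Point b = {q[i + 1].x - q[0].x, q[i + 1].y - q[0].y};
    area += std::fabs(cross2d(a, b));
  }
  return area * 0.5f;
}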
mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu (deleted, 100644 → 0)
/*************************************************************************
* Copyright (C) 2022 by Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include <math.h>
/****************************************************************************************
*
* NRAM partition forward:
* | spatial_shapes | data_value_p1_ping | data_value_p2_ping |
* | data_value_p3_ping | data_value_p4_ping | data_col_ping |
* | data_value_p1_pong | data_value_p2_pong | data_value_p3_pong |
* | data_value_p4_pong | data_col_pong | auxiliary_a |
* | auxiliary_b |
* | 128bytes | deal_size | deal_size |
* | deal_size | deal_size | deal_size |
* | deal_size | deal_size | deal_size |
* | deal_size | deal_size | deal_size |
* | deal_size |
*
****************************************************************************************/
/****************************************************************************************
*
* NRAM partition backward:
* | grad_output_nram | grad_output_nram_temp | grad_weight |
* | grad_h_weight | grad_w_weight | top_grad |
* | top_grad_temp | spatial_shapes_nram | sampling_loc_nram |
* | deal_size | deal_size | deal_size |
* | deal_size | deal_size | deal_size |
* | deal_size | deal_size | 64bytes |
*
****************************************************************************************/
#define TWELVE_SPLIT 12
#define ALIGN_NUM 64
#define ALIGN_NUM_FOR_REDUCE 32
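// Editorial note: TWELVE_SPLIT matches the 12 deal_size regions in the forward
// NRAM partition above (4 ping value buffers + data_col_ping, 4 pong value
// buffers + data_col_pong, plus the two auxiliary buffers); span_num_deal in
// the forward kernel is derived as (MAX_NRAM_SIZE - spatial_size) /
// TWELVE_SPLIT / sizeof(T), aligned down to NFU_ALIGN_SIZE.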
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void loadNeighborPointsData(
const T *data_value_gdram, T *data_value_p1_nram, T *data_value_p2_nram,
T *data_value_p3_nram, T *data_value_p4_nram, const size_t deal_num,
const int32_t &width, const int32_t &height, const int32_t &num_heads,
const int32_t &channels, const T &x, const T &y, const int32_t &head_idx) {
const int32_t w_low = floorf(x);
const int32_t h_low = floorf(y);
const int32_t w_high = w_low + 1;
const int32_t h_high = h_low + 1;
const int32_t w_stride = num_heads * channels;
const int32_t h_stride = width * w_stride;
const int32_t h_low_ptr_offset = h_low * h_stride;
const int32_t h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int32_t w_low_ptr_offset = w_low * w_stride;
const int32_t w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int32_t base_ptr_offset = head_idx * channels;
// top-left point
if (h_low >= 0 && w_low >= 0) {
const int32_t v1_offset =
h_low_ptr_offset + w_low_ptr_offset + base_ptr_offset;
__memcpy_async(data_value_p1_nram, data_value_gdram + v1_offset,
deal_num * sizeof(T), GDRAM2NRAM);
}
// top-right point
if (h_low >= 0 && w_high <= width - 1) {
const int32_t v2_offset =
h_low_ptr_offset + w_high_ptr_offset + base_ptr_offset;
__memcpy_async(data_value_p2_nram, data_value_gdram + v2_offset,
deal_num * sizeof(T), GDRAM2NRAM);
}
// bottom-left point
if (h_high <= height - 1 && w_low >= 0) {
const int32_t v3_offset =
h_high_ptr_offset + w_low_ptr_offset + base_ptr_offset;
__memcpy_async(data_value_p3_nram, data_value_gdram + v3_offset,
deal_num * sizeof(T), GDRAM2NRAM);
}
// bottom-right point
if (h_high <= height - 1 && w_high <= width - 1) {
const int32_t v4_offset =
h_high_ptr_offset + w_high_ptr_offset + base_ptr_offset;
__memcpy_async(data_value_p4_nram, data_value_gdram + v4_offset,
deal_num * sizeof(T), GDRAM2NRAM);
}
}
template <typename T>
__mlu_func__ void bilinearInterpolation(
T *data_value_p1_nram, T *data_value_p2_nram, T *data_value_p3_nram,
T *data_value_p4_nram, T *sample_point_value, T *auxiliary_b,
const size_t deal_num, const int32_t &width, const int32_t &height,
const T &x, const T &y) {
const int32_t w_low = floorf(x);
const int32_t h_low = floorf(y);
const int32_t w_high = w_low + 1;
const int32_t h_high = h_low + 1;
const T lw = x - w_low;
const T lh = y - h_low;
const T hw = 1 - lw;
const T hh = 1 - lh;
const T w1 = hh * hw;
const T w2 = hh * lw;
const T w3 = lh * hw;
const T w4 = lh * lw;
__bang_write_value((T *)sample_point_value, deal_num, (T)0);
// top-left point
if (h_low >= 0 && w_low >= 0) {
// sample_point_value += v1 * w1
__bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p1_nram, (T)w1,
deal_num);
__bang_add((T *)sample_point_value, (T *)sample_point_value,
(T *)auxiliary_b, deal_num);
}
// top-right point
if (h_low >= 0 && w_high <= width - 1) {
// sample_point_value += v2 * w2
__bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p2_nram, (T)w2,
deal_num);
__bang_add((T *)sample_point_value, (T *)sample_point_value,
(T *)auxiliary_b, deal_num);
}
// bottom-left point
if (h_high <= height - 1 && w_low >= 0) {
// sample_point_value += v3 * w3
__bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p3_nram, (T)w3,
deal_num);
__bang_add((T *)sample_point_value, (T *)sample_point_value,
(T *)auxiliary_b, deal_num);
}
// bottom-right point
if (h_high <= height - 1 && w_high <= width - 1) {
// sample_point_value += v4 * w4
__bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p4_nram, (T)w4,
deal_num);
__bang_add((T *)sample_point_value, (T *)sample_point_value,
(T *)auxiliary_b, deal_num);
}
}
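/* Scalar reference for the vectorized bilinearInterpolation above (a sketch
 * for clarity, not used by the kernel; `v` is assumed to hold one channel of
 * one head in row-major [height][width] order). Neighbors falling outside
 * the feature map contribute zero, matching the guarded branches above. */
template <typename T>
static T bilinearRef(const T *v, const int32_t height, const int32_t width,
                     const T x, const T y) {
  const int32_t w_low = floorf(x), h_low = floorf(y);
  const int32_t w_high = w_low + 1, h_high = h_low + 1;
  const T lw = x - w_low, lh = y - h_low;  // fractional parts
  const T hw = 1 - lw, hh = 1 - lh;
  T acc = 0;
  if (h_low >= 0 && w_low >= 0) acc += hh * hw * v[h_low * width + w_low];
  if (h_low >= 0 && w_high <= width - 1)
    acc += hh * lw * v[h_low * width + w_high];
  if (h_high <= height - 1 && w_low >= 0)
    acc += lh * hw * v[h_high * width + w_low];
  if (h_high <= height - 1 && w_high <= width - 1)
    acc += lh * lw * v[h_high * width + w_high];
  return acc;
}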
template <typename T>
__mlu_global__ void MLUKernelMsDeformAttnForward(
const char *data_value_gdram, const char *data_spatial_shapes_gdram,
const char *data_level_start_index_gdram,
const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram,
const int32_t batch_size, const int32_t num_keys, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_queries,
const int32_t num_points, char *data_col_gdram) {
if (coreId == 0x80) {
return;
}
const size_t spatial_size = PAD_UP(2 * sizeof(int32_t), NFU_ALIGN_SIZE);
const size_t span_num_deal =
PAD_DOWN((MAX_NRAM_SIZE - spatial_size) / TWELVE_SPLIT / sizeof(T),
NFU_ALIGN_SIZE);
const size_t align_num = NFU_ALIGN_SIZE;
const int32_t channels_seg_num = channels / span_num_deal;
const size_t channels_rem = channels % span_num_deal;
const size_t channels_align_rem = CEIL_ALIGN(channels_rem, align_num);
char *data_spatial_shapes_nram = nram_buffer;
char *ping_data_value_p1_nram = data_spatial_shapes_nram + spatial_size;
char *ping_data_value_p2_nram =
ping_data_value_p1_nram + span_num_deal * sizeof(T);
char *ping_data_value_p3_nram =
ping_data_value_p2_nram + span_num_deal * sizeof(T);
char *ping_data_value_p4_nram =
ping_data_value_p3_nram + span_num_deal * sizeof(T);
char *ping_data_col_nram =
ping_data_value_p4_nram + span_num_deal * sizeof(T);
char *pong_data_value_p1_nram =
ping_data_col_nram + span_num_deal * sizeof(T);
char *pong_data_value_p2_nram =
pong_data_value_p1_nram + span_num_deal * sizeof(T);
char *pong_data_value_p3_nram =
pong_data_value_p2_nram + span_num_deal * sizeof(T);
char *pong_data_value_p4_nram =
pong_data_value_p3_nram + span_num_deal * sizeof(T);
char *pong_data_col_nram =
pong_data_value_p4_nram + span_num_deal * sizeof(T);
char *auxiliary_a = pong_data_col_nram + span_num_deal * sizeof(T);
char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T);
const size_t ping_pong_gap = 5 * span_num_deal * sizeof(T);
size_t data_col_ping_pong_idx = 0;
int32_t block_num_per_core = (batch_size * num_queries * num_heads) / taskDim;
const int32_t block_num_rem =
(batch_size * num_queries * num_heads) % taskDim;
const int32_t idx_start = taskId < (block_num_rem + 1)
? taskId * (block_num_per_core + 1)
: taskId * block_num_per_core + block_num_rem;
block_num_per_core =
taskId < block_num_rem
? (batch_size * num_queries * num_heads) / taskDim + 1
: (batch_size * num_queries * num_heads) / taskDim;
for (int32_t cur_idx = idx_start; cur_idx < idx_start + block_num_per_core;
++cur_idx) {
// cur_idx = batch_idx * num_queries * num_heads + query_idx * num_heads +
// head_idx
const int32_t head_idx = cur_idx % num_heads;
const int32_t batch_idx = (cur_idx / num_heads) / num_queries;
const char *data_value_gdram_start =
data_value_gdram +
batch_idx * num_keys * num_heads * channels * sizeof(T);
const char *data_sampling_loc_gdram_start =
data_sampling_loc_gdram +
cur_idx * num_levels * num_points * 2 * sizeof(T);
const char *data_attn_weight_gdram_start =
data_attn_weight_gdram + cur_idx * num_levels * num_points * sizeof(T);
char *data_col_gdram_start =
data_col_gdram + cur_idx * channels * sizeof(T);
for (int32_t c_seg_idx = 0; c_seg_idx < channels_seg_num; ++c_seg_idx) {
__bang_write_value(
(T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap),
span_num_deal, (T)0);
// load data
// level_idx = 0, point_idx = 0
__memcpy(data_spatial_shapes_nram, data_spatial_shapes_gdram,
2 * sizeof(int32_t), GDRAM2NRAM);
int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0];
int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1];
const char *data_value_ptr =
data_value_gdram_start + c_seg_idx * span_num_deal * sizeof(T);
T loc_w = ((T *)data_sampling_loc_gdram_start)[0];
T loc_h = ((T *)data_sampling_loc_gdram_start)[1];
T weight = ((T *)data_attn_weight_gdram_start)[0];
T x = loc_w * spatial_w - 0.5;
T y = loc_h * spatial_h - 0.5;
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr, (T *)ping_data_value_p1_nram,
(T *)ping_data_value_p2_nram, (T *)ping_data_value_p3_nram,
(T *)ping_data_value_p4_nram, span_num_deal, spatial_w, spatial_h,
num_heads, channels, x, y, head_idx);
}
T spatial_h_next_point = 0;
T spatial_w_next_point = 0;
T weight_next_point = 0;
T x_next_point = 0;
T y_next_point = 0;
__asm__ volatile("sync;");
for (int32_t level_idx = 0; level_idx < num_levels; ++level_idx) {
for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) {
// load data
if (point_idx == num_points - 1 && level_idx == num_levels - 1) {
            // last point: nothing left to load, go straight to compute
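PLACEHOLDER_SKIP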
} else if (point_idx == num_points - 1) {
const int32_t level_start_id =
((int32_t *)data_level_start_index_gdram)[level_idx + 1];
const int32_t spatial_h_ptr = (level_idx + 1) << 1;
__memcpy(
data_spatial_shapes_nram,
data_spatial_shapes_gdram + spatial_h_ptr * sizeof(int32_t),
2 * sizeof(int32_t), GDRAM2NRAM);
spatial_h_next_point = ((int32_t *)data_spatial_shapes_nram)[0];
spatial_w_next_point = ((int32_t *)data_spatial_shapes_nram)[1];
data_value_ptr = data_value_gdram_start +
(level_start_id * num_heads * channels +
c_seg_idx * span_num_deal) *
sizeof(T);
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1];
x_next_point = loc_w * spatial_w_next_point - 0.5;
y_next_point = loc_h * spatial_h_next_point - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h_next_point &&
x_next_point < spatial_w_next_point) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
span_num_deal, spatial_w_next_point, spatial_h_next_point,
num_heads, channels, x_next_point, y_next_point, head_idx);
}
} else {
spatial_h_next_point = spatial_h;
spatial_w_next_point = spatial_w;
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1];
x_next_point = loc_w * spatial_w - 0.5;
y_next_point = loc_h * spatial_h - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h && x_next_point < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
span_num_deal, spatial_w, spatial_h, num_heads, channels,
x_next_point, y_next_point, head_idx);
}
}
// compute
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
bilinearInterpolation(
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)auxiliary_a, (T *)auxiliary_b, span_num_deal, spatial_w,
spatial_h, x, y);
__bang_mul_scalar((T *)auxiliary_a, (T *)auxiliary_a, (T)weight,
span_num_deal);
__bang_add((T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap),
(T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap),
(T *)auxiliary_a, span_num_deal);
}
spatial_w = spatial_w_next_point;
spatial_h = spatial_h_next_point;
weight = weight_next_point;
x = x_next_point;
y = y_next_point;
__asm__ volatile("sync;");
}
}
// store
__memcpy_async(
data_col_gdram_start + c_seg_idx * span_num_deal * sizeof(T),
ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap,
span_num_deal * sizeof(T), NRAM2GDRAM);
data_col_ping_pong_idx = (data_col_ping_pong_idx + 1) % 2;
}
if (channels_rem > 0) {
__bang_write_value(
(T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap),
channels_align_rem, (T)0);
// load data
// level_idx = 0, point_idx = 0
__memcpy(data_spatial_shapes_nram, data_spatial_shapes_gdram,
2 * sizeof(int32_t), GDRAM2NRAM);
int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0];
int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1];
const char *data_value_ptr =
data_value_gdram_start + channels_seg_num * span_num_deal * sizeof(T);
T loc_w = ((T *)data_sampling_loc_gdram_start)[0];
T loc_h = ((T *)data_sampling_loc_gdram_start)[1];
T weight = ((T *)data_attn_weight_gdram_start)[0];
T x = loc_w * spatial_w - 0.5;
T y = loc_h * spatial_h - 0.5;
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr, (T *)ping_data_value_p1_nram,
(T *)ping_data_value_p2_nram, (T *)ping_data_value_p3_nram,
(T *)ping_data_value_p4_nram, channels_rem, spatial_w, spatial_h,
num_heads, channels, x, y, head_idx);
}
T spatial_h_next_point = 0;
T spatial_w_next_point = 0;
T weight_next_point = 0;
T x_next_point = 0;
T y_next_point = 0;
__asm__ volatile("sync;");
for (int32_t level_idx = 0; level_idx < num_levels; ++level_idx) {
for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) {
// load data
if (point_idx == num_points - 1 && level_idx == num_levels - 1) {
            // last point: nothing left to load, go straight to compute
} else if (point_idx == num_points - 1) {
const int32_t level_start_id =
((int32_t *)data_level_start_index_gdram)[level_idx + 1];
const int32_t spatial_h_ptr = (level_idx + 1) << 1;
__memcpy(
data_spatial_shapes_nram,
data_spatial_shapes_gdram + spatial_h_ptr * sizeof(int32_t),
2 * sizeof(int32_t), GDRAM2NRAM);
spatial_h_next_point = ((int32_t *)data_spatial_shapes_nram)[0];
spatial_w_next_point = ((int32_t *)data_spatial_shapes_nram)[1];
data_value_ptr = data_value_gdram_start +
(level_start_id * num_heads * channels +
channels_seg_num * span_num_deal) *
sizeof(T);
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1];
x_next_point = loc_w * spatial_w_next_point - 0.5;
y_next_point = loc_h * spatial_h_next_point - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h_next_point &&
x_next_point < spatial_w_next_point) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
channels_rem, spatial_w_next_point, spatial_h_next_point,
num_heads, channels, x_next_point, y_next_point, head_idx);
}
} else {
spatial_w_next_point = spatial_w;
spatial_h_next_point = spatial_h;
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1];
x_next_point = loc_w * spatial_w - 0.5;
y_next_point = loc_h * spatial_h - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h && x_next_point < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
channels_rem, spatial_w, spatial_h, num_heads, channels,
x_next_point, y_next_point, head_idx);
}
}
// compute
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
bilinearInterpolation(
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)auxiliary_a, (T *)auxiliary_b, channels_align_rem,
spatial_w, spatial_h, x, y);
__bang_mul_scalar((T *)auxiliary_a, (T *)auxiliary_a, (T)weight,
channels_align_rem);
__bang_add((T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap),
(T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap),
(T *)auxiliary_a, channels_align_rem);
}
spatial_w = spatial_w_next_point;
spatial_h = spatial_h_next_point;
weight = weight_next_point;
x = x_next_point;
y = y_next_point;
__asm__ volatile("sync;");
}
}
// store
__memcpy_async(
data_col_gdram_start + channels_seg_num * span_num_deal * sizeof(T),
ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap,
channels_rem * sizeof(T), NRAM2GDRAM);
data_col_ping_pong_idx = (data_col_ping_pong_idx + 1) % 2;
}
}
__asm__ volatile("sync;");
return;
}
template __mlu_global__ void MLUKernelMsDeformAttnForward<float>(
const char *data_value_gdram, const char *data_spatial_shapes_gdram,
const char *data_level_start_index_gdram,
const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram,
const int32_t batch_size, const int32_t num_keys, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_queries,
const int32_t num_points, char *data_col_gdram);
void KernelMsDeformAttnForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const char *data_value_gdram,
const char *data_spatial_shapes_gdram,
const char *data_level_start_index_gdram,
const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram,
const int32_t batch_size, const int32_t num_keys, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_queries,
const int32_t num_points, char *data_col_gdram) {
MLUKernelMsDeformAttnForward<float><<<k_dim, k_type, queue>>>(
data_value_gdram, data_spatial_shapes_gdram, data_level_start_index_gdram,
data_sampling_loc_gdram, data_attn_weight_gdram, batch_size, num_keys,
num_heads, channels, num_levels, num_queries, num_points, data_col_gdram);
}
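/* Sketch of the per-core work split used by the forward kernel (illustrative
 * host-side form; the kernel writes equivalent arithmetic inline). The
 * total = batch_size * num_queries * num_heads blocks are divided so the
 * first (total % taskDim) cores take one extra block, and each linear index
 * decomposes as batch_idx * (num_queries * num_heads) +
 * query_idx * num_heads + head_idx. Function name is illustrative only. */
static inline void forwardBlockRange(const int32_t total,
                                     const int32_t task_dim,
                                     const int32_t task_id, int32_t *start,
                                     int32_t *count) {
  const int32_t per_core = total / task_dim;
  const int32_t rem = total % task_dim;
  *count = per_core + (task_id < rem ? 1 : 0);
  *start = task_id < rem ? task_id * (per_core + 1)
                         : task_id * per_core + rem;
}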
template <typename T>
void __mlu_func__ msDeformAttnCol2imBilinear(
T *top_grad_temp, const int32_t &height, const int32_t &width, const T &w1,
const T &w2, const T &w3, const T &w4, const int32_t &h_low,
const int32_t &w_low, const int32_t &h_high, const int32_t &w_high,
const int32_t &base_ptr, const int32_t &h_low_ptr_offset,
const int32_t &w_low_ptr_offset, const int32_t &h_high_ptr_offset,
const int32_t &w_high_ptr_offset, const T &hh, const T &hw, const T &lh,
const T &lw, T *top_grad, const T &data_attn_weight, T *grad_h_weight,
T *grad_w_weight, T *grad_value, T *grad_output_nram, T *grad_weight,
T *grad_sampling_loc, T *grad_attn_weight, T *grad_output_nram_temp,
const int32_t &deal_num, const int32_t &deal_num_real,
const T *data_value_ptr) {
if (h_low >= 0 && w_low >= 0) {
int32_t offset1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
__memcpy(grad_output_nram, data_value_ptr + offset1,
deal_num_real * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(grad_weight, grad_output_nram, hw, deal_num);
__bang_sub(grad_h_weight, grad_h_weight, grad_weight, deal_num);
__bang_mul_scalar(grad_weight, grad_output_nram, hh, deal_num);
__bang_sub(grad_w_weight, grad_w_weight, grad_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad_temp, w1, deal_num);
// for calc grad_attn_weight
__bang_mul_scalar(grad_output_nram, grad_output_nram, w1, deal_num);
__bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset1),
(T *)top_grad_temp, deal_num_real);
}
if (h_low >= 0 && w_high <= width - 1) {
int32_t offset2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
__memcpy(grad_output_nram_temp, data_value_ptr + offset2,
deal_num_real * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, lw, deal_num);
__bang_sub(grad_h_weight, grad_h_weight, grad_weight, deal_num);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, hh, deal_num);
__bang_add(grad_w_weight, grad_w_weight, grad_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad_temp, w2, deal_num);
__bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w2,
deal_num);
__bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp,
deal_num);
__bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset2),
(T *)top_grad_temp, deal_num_real);
}
if (h_high <= height - 1 && w_low >= 0) {
int32_t offset3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
__memcpy(grad_output_nram_temp, data_value_ptr + offset3,
deal_num_real * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, hw, deal_num);
__bang_add(grad_h_weight, grad_h_weight, grad_weight, deal_num);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, lh, deal_num);
__bang_sub(grad_w_weight, grad_w_weight, grad_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad_temp, w3, deal_num);
// for calc grad_attn_weight
__bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w3,
deal_num);
__bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp,
deal_num);
__bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset3),
(T *)top_grad_temp, deal_num_real);
}
if (h_high <= height - 1 && w_high <= width - 1) {
int32_t offset4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
__memcpy(grad_output_nram_temp, data_value_ptr + offset4,
deal_num_real * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, lw, deal_num);
__bang_add(grad_h_weight, grad_h_weight, grad_weight, deal_num);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, lh, deal_num);
__bang_add(grad_w_weight, grad_w_weight, grad_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad_temp, w4, deal_num);
// for calc grad_attn_weight
__bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w4,
deal_num);
__bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp,
deal_num);
__bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset4),
(T *)top_grad_temp, deal_num_real);
}
__bang_mul(grad_output_nram, grad_output_nram, top_grad, deal_num);
#if __BANG_ARCH__ >= 322
recursiveSumPool(grad_output_nram, 1, deal_num_real, ALIGN_NUM_FOR_REDUCE);
#else
const int32_t align_num_on_200 = NFU_ALIGN_SIZE / sizeof(float);
recursiveSumPool(grad_output_nram, align_num_on_200,
deal_num / align_num_on_200, ALIGN_NUM_FOR_REDUCE);
__bang_reduce_sum(grad_output_nram, grad_output_nram,
NFU_ALIGN_SIZE / sizeof(float));
#endif
__bang_atomic_add((T *)grad_output_nram, (T *)grad_attn_weight,
(T *)grad_output_nram, 1);
__bang_mul_scalar(grad_w_weight, grad_w_weight, width, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul(grad_w_weight, grad_w_weight, top_grad_temp, deal_num);
#if __BANG_ARCH__ >= 322
recursiveSumPool(grad_w_weight, 1, deal_num_real, ALIGN_NUM_FOR_REDUCE);
#else
recursiveSumPool(grad_w_weight, align_num_on_200, deal_num / align_num_on_200,
ALIGN_NUM_FOR_REDUCE);
__bang_reduce_sum(grad_w_weight, grad_w_weight,
NFU_ALIGN_SIZE / sizeof(float));
#endif
__bang_atomic_add((T *)grad_w_weight, (T *)(grad_sampling_loc),
(T *)grad_w_weight, 1);
__bang_mul_scalar(grad_h_weight, grad_h_weight, height, deal_num);
__bang_mul(grad_h_weight, grad_h_weight, top_grad_temp, deal_num);
#if __BANG_ARCH__ >= 322
recursiveSumPool(grad_h_weight, 1, deal_num_real, ALIGN_NUM_FOR_REDUCE);
#else
recursiveSumPool(grad_h_weight, align_num_on_200, deal_num / align_num_on_200,
ALIGN_NUM_FOR_REDUCE);
__bang_reduce_sum(grad_h_weight, grad_h_weight,
NFU_ALIGN_SIZE / sizeof(float));
#endif
__bang_atomic_add((T *)grad_h_weight, (T *)(grad_sampling_loc + 1),
(T *)grad_h_weight, 1);
}
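/* Scalar view of what msDeformAttnCol2imBilinear accumulates (a reading aid,
 * not extra computation). With out_c = w1*v1 + w2*v2 + w3*v3 + w4*v4 per
 * channel c, top gradient g, and attention weight a, the forward weights
 * w1 = hh*hw, w2 = hh*lw, w3 = lh*hw, w4 = lh*lw give:
 *   grad_attn_weight    += sum_c g * out_c
 *   grad_value[v_i]     += a * g * w_i                            (atomic adds)
 *   grad_sampling_loc_x += width  * a * sum_c g * (-hh*v1 + hh*v2 - lh*v3 + lh*v4)
 *   grad_sampling_loc_y += height * a * sum_c g * (-hw*v1 - lw*v2 + hw*v3 + lw*v4)
 * The width/height factors undo the normalization x = loc_w * width - 0.5,
 * y = loc_h * height - 0.5 used in the sampling. */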
__mlu_global__ void MLUUnion1KernelMsDeformAttnBackward(
const float *data_value, const int32_t *spatial_shapes,
const int32_t *data_level_start_index, const float *data_sampling_loc,
const float *data_attn_weight, const float *grad_output,
const int32_t batch, const int32_t spatial_size, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_query,
const int32_t num_points, float *grad_value, float *grad_sampling_loc,
float *grad_attn_weight) {
if (coreId == 0x80) {
return;
}
const int32_t split_num = 8;
const int32_t spatial_shapes_size = 64;
int32_t deal_num = PAD_DOWN(
(MAX_NRAM_SIZE - spatial_shapes_size) / split_num / sizeof(float),
ALIGN_NUM);
float *grad_output_nram = (float *)nram_buffer;
float *grad_output_nram_temp = (float *)nram_buffer + deal_num;
float *grad_weight = (float *)nram_buffer + 2 * deal_num;
float *grad_h_weight = (float *)nram_buffer + 3 * deal_num;
float *grad_w_weight = (float *)nram_buffer + 4 * deal_num;
float *top_grad = (float *)nram_buffer + 5 * deal_num;
float *top_grad_temp = (float *)nram_buffer + 6 * deal_num;
int32_t *spatial_shapes_nram =
(int32_t *)((float *)nram_buffer + 7 * deal_num);
float *sampling_loc_nram =
(float *)nram_buffer + 7 * deal_num + 2 * sizeof(int32_t);
const int32_t total_num = batch * num_query * num_heads * num_levels;
int32_t num_per_core = total_num / taskDim;
int32_t num_rem = total_num % taskDim;
num_per_core = num_per_core + int32_t(taskId < num_rem);
int32_t start_per_core =
num_rem > taskId
? (taskId * num_per_core)
: ((num_per_core + 1) * num_rem + (taskId - num_rem) * num_per_core);
int32_t end_per_core = start_per_core + num_per_core;
const int32_t C_repeat = channels / deal_num;
const int32_t C_tail = channels % deal_num;
const int32_t qid_stride = num_heads * channels;
int32_t base_ptr = 0;
for (int32_t num_loop = start_per_core; num_loop < end_per_core; ++num_loop) {
const int32_t l_col = num_loop % num_levels;
const int32_t m_col = num_loop / num_levels % num_heads;
const int32_t q_col = num_loop / num_levels / num_heads % num_query;
const int32_t b_col = num_loop / num_query / num_heads / num_levels;
int32_t data_weight_ptr = num_loop * num_points;
int32_t data_loc_w_ptr = data_weight_ptr << 1;
const int32_t value_offset = b_col * spatial_size * num_heads * channels;
const int32_t level_start_id = data_level_start_index[l_col];
int32_t spatial_h_ptr = l_col << 1;
int32_t grad_output_offset = b_col * num_query * num_heads * channels +
q_col * num_heads * channels +
m_col * channels;
__memcpy(spatial_shapes_nram, spatial_shapes + spatial_h_ptr,
2 * sizeof(int32_t), GDRAM2NRAM);
const int32_t spatial_h = spatial_shapes_nram[0];
const int32_t spatial_w = spatial_shapes_nram[1];
const int32_t value_ptr_offset = value_offset + level_start_id * qid_stride;
const float *data_value_ptr = data_value + value_ptr_offset;
float *grad_value_ptr = grad_value + value_ptr_offset;
const int32_t grad_attn_weight_out = num_loop * num_points;
const int32_t grad_sampling_loc_out = num_loop * num_points * 2;
for (int32_t p_col = 0; p_col < num_points; ++p_col) {
__memcpy(sampling_loc_nram, data_sampling_loc + data_loc_w_ptr,
2 * sizeof(float), GDRAM2NRAM);
const float loc_w = sampling_loc_nram[0];
const float loc_h = sampling_loc_nram[1];
const float weight = data_attn_weight[data_weight_ptr];
const float h_im = loc_h * spatial_h - 0.5;
const float w_im = loc_w * spatial_w - 0.5;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
const int32_t h_low = floorf(h_im);
const int32_t w_low = floorf(w_im);
const int32_t h_high = h_low + 1;
const int32_t w_high = w_low + 1;
const float lh = h_im - h_low;
const float lw = w_im - w_low;
const float hh = 1.0 - lh;
const float hw = 1.0 - lw;
const int32_t w_stride = num_heads * channels;
const int32_t h_stride = spatial_w * w_stride;
const int32_t h_low_ptr_offset = h_low * h_stride;
const int32_t h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int32_t w_low_ptr_offset = w_low * w_stride;
const int32_t w_high_ptr_offset = w_low_ptr_offset + w_stride;
float w1 = hh * hw;
float w2 = hh * lw;
float w3 = lh * hw;
float w4 = lh * lw;
for (int32_t C_loop = 0; C_loop < C_repeat; ++C_loop) {
base_ptr = m_col * channels + C_loop * deal_num;
__bang_write_zero(grad_weight, 3 * deal_num);
__bang_write_zero(grad_output_nram, deal_num);
__memcpy(top_grad,
grad_output + grad_output_offset + C_loop * deal_num,
deal_num * sizeof(float), GDRAM2NRAM);
msDeformAttnCol2imBilinear(
top_grad_temp, spatial_h, spatial_w, w1, w2, w3, w4, h_low, w_low,
h_high, w_high, base_ptr, h_low_ptr_offset, w_low_ptr_offset,
h_high_ptr_offset, w_high_ptr_offset, hh, hw, lh, lw, top_grad,
weight, grad_h_weight, grad_w_weight, grad_value_ptr,
grad_output_nram, grad_weight,
grad_sampling_loc + grad_sampling_loc_out + p_col * 2,
grad_attn_weight + grad_attn_weight_out + p_col,
grad_output_nram_temp, deal_num, deal_num, data_value_ptr);
}
if (C_tail != 0) {
base_ptr = m_col * channels + C_repeat * deal_num;
__bang_write_zero(grad_output_nram, 8 * deal_num);
__memcpy(top_grad,
grad_output + grad_output_offset + C_repeat * deal_num,
C_tail * sizeof(float), GDRAM2NRAM);
msDeformAttnCol2imBilinear(
top_grad_temp, spatial_h, spatial_w, w1, w2, w3, w4, h_low, w_low,
h_high, w_high, base_ptr, h_low_ptr_offset, w_low_ptr_offset,
h_high_ptr_offset, w_high_ptr_offset, hh, hw, lh, lw, top_grad,
weight, grad_h_weight, grad_w_weight, grad_value_ptr,
grad_output_nram, grad_weight,
grad_sampling_loc + grad_sampling_loc_out + p_col * 2,
grad_attn_weight + grad_attn_weight_out + p_col,
grad_output_nram_temp, deal_num, C_tail, data_value_ptr);
}
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
}
__mlu_global__ void MLUUnion1KernelMsDeformAttnBackward(
const float *data_value, const int32_t *spatial_shapes,
const int32_t *data_level_start_index, const float *data_sampling_loc,
const float *data_attn_weight, const float *grad_output,
const int32_t batch, const int32_t spatial_size, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_query,
const int32_t num_points, float *grad_value, float *grad_sampling_loc,
float *grad_attn_weight);
void KernelMsDeformAttnBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const float *data_value,
const int32_t *spatial_shapes, const int32_t *data_level_start_index,
const float *data_sampling_loc, const float *data_attn_weight,
const float *grad_output, const int32_t batch, const int32_t spatial_size,
const int32_t num_heads, const int32_t channels, const int32_t num_levels,
const int32_t num_query, const int32_t num_points, float *grad_value,
float *grad_sampling_loc, float *grad_attn_weight) {
MLUUnion1KernelMsDeformAttnBackward<<<k_dim, k_type, queue>>>(
data_value, spatial_shapes, data_level_start_index, data_sampling_loc,
data_attn_weight, grad_output, batch, spatial_size, num_heads, channels,
num_levels, num_query, num_points, grad_value, grad_sampling_loc,
grad_attn_weight);
}
mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
deleted
100644 → 0
View file @
6f674c7e
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "nms_utils.hpp"
#define COORD_DIM (4)
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
__nram__ int8_t nram_buffer[SIZE_NRAM_BUF];
__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF];
enum Addr { SRAM, GDRAM };
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection(
uint32_t &output_box_num, const int output_mode, OUT_DT *output_dram,
IN_DT *input_data_score, const IN_DT *input_data_box, const Addr input_ram,
IN_DT *sram, const int core_limit, const int input_num_boxes,
const int max_output_size, const float thresh_iou, const float thresh_score,
const float offset, const int algo) {
// global value
int32_t *exit_flag = (int32_t *)(sram + 28);
exit_flag[0] = 0;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
int nms_buffer_count1 = 9;
// temp nram buffer to store selected target.
int nram_save_limit_count = 256;
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr
const IN_DT *input_x1_ptr = input_data_box;
const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;
  int limit = 0;        // max boxes one NRAM segment can hold (input in GDRAM or SRAM)
  int max_seg_pad = 0;  // the max segment length per repeat
int repeat = 0;
int remain = 0;
int remain_pad = 0;
int input_offset = 0; // offset of input_data for current core
int nram_save_count = 0;
if (output_mode == 0) {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
} else {
    // 5 means: score, x1, y1, x2, y2
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * 5 * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
}
int max_seg_iou_compute = 0;
int repeat_iou_compute = 0;
int remain_iou_compute = 0;
int remain_pad_iou_compute = 0;
getComputeParamsBlockOrU1(sizeof(IN_DT), input_num_boxes, limit, core_limit,
input_offset, max_seg_pad, repeat, remain,
remain_pad, max_seg_iou_compute, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute);
// init the data ptr
IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *x1 = score + max_seg_pad;
IN_DT *y1 = x1 + max_seg_pad;
IN_DT *x2 = y1 + max_seg_pad;
IN_DT *y2 = x2 + max_seg_pad;
IN_DT *inter_x1 = y2 + max_seg_pad;
IN_DT *inter_y1 = inter_x1 + max_seg_pad;
IN_DT *inter_x2 = inter_y1 + max_seg_pad;
IN_DT *inter_y2 = inter_x2 + max_seg_pad;
IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
                 NFU_ALIGN_SIZE);  // offset two lines from max_box
#if __BANG_ARCH__ >= 300
float max_box_x1 = 0;
float max_box_y1 = 0;
float max_box_x2 = 0;
float max_box_y2 = 0;
#endif
mluMemcpyDirection_t load_dir = SRAM2NRAM;
mluMemcpyDirection_t store_dir = NRAM2SRAM;
load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= 0
if (core_limit != 1) {
__sync_cluster(); // sync before current loop
}
/******FIND MAX START******/
int max_index = 0; // the max score index
int global_max_index = 0; // for U1
    float max_area = 0;        // the max score area
max_box[0] = 0; // init 0
findCoreMaxBox(input_data_score, score, inter_x1, max_box, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
input_offset, repeat, remain, remain_pad, max_seg_pad,
max_index);
if (core_limit == 1) {
#if __BANG_ARCH__ >= 300
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box_x2, max_box_y2);
#else
calMaxArea(max_box, algo, offset, max_area);
#endif
input_data_score[max_index] = 0;
global_max_index = max_index;
} else if (core_limit == 4) {
__sync_cluster();
findClusterMaxBox(sram, max_box, inter_x1, input_data_score, core_limit);
#if __BANG_ARCH__ >= 300
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box_x2, max_box_y2);
#else
calMaxArea(max_box, algo, offset, max_area);
#endif
global_max_index = ((uint32_t *)(max_box + 5))[0];
input_data_score[global_max_index] = 0;
}
// by now, we get: max_score|max_index|max_box|max_area
/******FIND MAX END******/
storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
max_output_size, thresh_score, output_mode, nram_save_count,
output_box_num);
// if the max score <= 0, end
if (core_limit == 1) {
if (float(max_box[0]) <= thresh_score) {
break;
}
} else {
if (float(max_box[0]) <= thresh_score) {
if (coreId == 0) {
exit_flag[0] = 1;
}
}
__sync_cluster();
if (exit_flag[0] == 1) {
break;
}
}
/******NMS STORE END******/
#if __BANG_ARCH__ >= 300
scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box_x1,
max_box_y1, max_box_x2, max_box_y2, nram_save,
repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
input_offset, offset, max_area, input_num_boxes, algo);
#else
scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box[1],
max_box[2], max_box[3], max_box[4], nram_save,
repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
input_offset, offset, max_area, input_num_boxes, algo);
#endif
} // for max_output_size
}
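/* Host-side scalar sketch of the greedy loop that nms_detection vectorizes
 * (illustration only, assuming <algorithm> and <vector> are available; `off`
 * plays the role of `offset`, added to box extents when algo != 0, as in
 * calMaxArea; names here are hypothetical). */
static void nmsRef(const float *x1, const float *y1, const float *x2,
                   const float *y2, float *score, const int n,
                   const int max_out, const float iou_thresh,
                   const float score_thresh, const float off,
                   std::vector<uint32_t> &keep_idx) {
  for (int k = 0; k < max_out; ++k) {
    const int best =
        static_cast<int>(std::max_element(score, score + n) - score);
    if (score[best] <= score_thresh) break;  // max score too low: stop
    keep_idx.push_back(best);
    const float best_area =
        (x2[best] - x1[best] + off) * (y2[best] - y1[best] + off);
    for (int i = 0; i < n; ++i) {
      if (i == best || score[i] == 0) continue;
      const float ix1 = std::max(x1[i], x1[best]);
      const float iy1 = std::max(y1[i], y1[best]);
      const float ix2 = std::min(x2[i], x2[best]);
      const float iy2 = std::min(y2[i], y2[best]);
      const float inter = std::max(0.f, ix2 - ix1 + off) *
                          std::max(0.f, iy2 - iy1 + off);
      const float area = (x2[i] - x1[i] + off) * (y2[i] - y1[i] + off);
      if (inter / (best_area + area - inter) > iou_thresh) score[i] = 0;
    }
    score[best] = 0;  // suppress the kept box itself for the next round
  }
}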
__mlu_global__ void MLUUnion1KernelNMS(
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int max_output_size,
const float iou_threshold, const float confidence_threshold,
const int output_mode, void *workspace, void *result_num, void *output,
const cnrtDataType_t data_type_input, const float offset, const int algo) {
if (data_type_input == CNRT_FLOAT16) {
__memcpy(workspace, input_confidence, input_num_boxes * sizeof(half),
GDRAM2GDRAM);
} else if (data_type_input == CNRT_FLOAT32) {
__memcpy(workspace, input_confidence, input_num_boxes * sizeof(float),
GDRAM2GDRAM);
} else {
}
uint32_t output_box_num = 0;
float *score_data = (float *)workspace;
float *boxes_data = (float *)input_boxes;
float *sram = (float *)sram_buffer;
if (output_mode == 0) {
if (data_type_input == CNRT_FLOAT32) {
nms_detection(output_box_num, output_mode, (uint32_t *)output, score_data,
boxes_data, GDRAM, sram, taskDim, input_num_boxes,
max_output_size, iou_threshold, confidence_threshold,
offset, algo);
} else {
nms_detection(output_box_num, output_mode, (uint32_t *)output,
(half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
taskDim, input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
}
} else {
if (data_type_input == CNRT_FLOAT32) {
nms_detection(output_box_num, output_mode, (float *)output, score_data,
boxes_data, GDRAM, sram, taskDim, input_num_boxes,
max_output_size, iou_threshold, confidence_threshold,
offset, algo);
} else {
nms_detection(output_box_num, output_mode, (half *)output,
(half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
taskDim, input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
}
}
((uint32_t *)result_num)[0] = output_box_num;
}
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection_ux(
int32_t *exit_flag, uint32_t &output_box_num, OUT_DT *output_dram,
IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram,
const int input_num_boxes, const int max_output_size,
const float thresh_iou, const float thresh_score, const float offset,
const int output_mode, const int algo, char *cdma_gdram) {
exit_flag[0] = 0;
IN_DT *sram = (IN_DT *)sram_buffer;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
int nms_buffer_count1 = 9;
// temp nram buffer to store selected target.
int nram_save_limit_count = 256;
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr
const IN_DT *input_x1_ptr = boxes_data;
const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;
  int limit = 0;        // max boxes one NRAM segment can hold (input in GDRAM or SRAM)
  int max_seg_pad = 0;  // the max segment length per repeat
int repeat = 0;
int remain = 0;
int remain_pad = 0;
int nram_save_count = 0;
if (output_mode == 0) {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
} else {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * INFO_NUM * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
}
int input_offset = 0;
int max_seg_iou_compute = 0;
int repeat_iou_compute = 0;
int remain_iou_compute = 0;
int remain_pad_iou_compute = 0;
getComputeParamsUx(sizeof(IN_DT), input_num_boxes, limit, input_offset,
max_seg_pad, repeat, remain, remain_pad,
max_seg_iou_compute, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute);
// init the nram ptr
IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *x1 = score + max_seg_pad;
IN_DT *y1 = x1 + max_seg_pad;
IN_DT *x2 = y1 + max_seg_pad;
IN_DT *y2 = x2 + max_seg_pad;
IN_DT *inter_x1 = y2 + max_seg_pad;
IN_DT *inter_y1 = inter_x1 + max_seg_pad;
IN_DT *inter_x2 = inter_y1 + max_seg_pad;
IN_DT *inter_y2 = inter_x2 + max_seg_pad;
IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
                 NFU_ALIGN_SIZE);  // offset two lines from max_box
#if __BANG_ARCH__ >= 300
float max_box_x1 = 0;
float max_box_y1 = 0;
float max_box_x2 = 0;
float max_box_y2 = 0;
#endif
mluMemcpyDirection_t load_dir = SRAM2NRAM;
mluMemcpyDirection_t store_dir = NRAM2SRAM;
load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= 0
__sync_all();
int max_index = 0;
int global_max_index = 0; // for Ux
    float max_area = 0;   // the max score area
max_box[0] = 0; // init 0
if (coreId == 0) {
findCoreMaxBox(score_data, score, inter_x1, max_box, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
input_offset, repeat, remain, remain_pad, max_seg_pad,
max_index);
// copy max box info to sram
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_all();
#if __BANG_ARCH__ >= 590
__memcpy((char *)cdma_gdram + REDUCE_NUM * clusterId * sizeof(IN_DT), sram,
REDUCE_NUM * sizeof(IN_DT), SRAM2GDRAM);
__sync_all();
if (clusterId == 0 && coreId == 0) {
__bang_write_zero(inter_x1, NMS_SIZE);
__memcpy((char *)inter_x1, (char *)cdma_gdram, sizeof(IN_DT), GDRAM2NRAM,
sizeof(IN_DT), REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
__bang_max(max_box, inter_x1, NMS_SIZE);
int max_cluster = (sizeof(IN_DT) == sizeof(half))
? ((uint16_t *)max_box)[1]
: ((uint32_t *)max_box)[1];
__memcpy((char *)cdma_gdram,
(char *)cdma_gdram + max_cluster * REDUCE_NUM * sizeof(IN_DT),
REDUCE_NUM * sizeof(IN_DT), GDRAM2GDRAM);
}
__sync_all();
__memcpy(max_box, cdma_gdram, REDUCE_NUM * sizeof(IN_DT), GDRAM2NRAM);
#else
findGlobalMaxBox(max_box, sram, inter_x1);
#endif
#if __BANG_ARCH__ >= 300
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box_x2, max_box_y2);
#else
calMaxArea(max_box, algo, offset, max_area);
#endif
global_max_index = ((uint32_t *)(max_box + 5))[0];
if (coreId != MEMORY_CORE) {
score_data[global_max_index] = 0;
}
storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
max_output_size, thresh_score, output_mode, nram_save_count,
output_box_num);
if (float(max_box[0]) <= thresh_score) {
if (clusterId == 0 && coreId == 0) {
exit_flag[0] = 1; // dram
}
}
__sync_all();
if (exit_flag[0] == 1) {
break;
}
/******NMS STORE END******/
#if __BANG_ARCH__ >= 300
scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
inter_y1, inter_x2, inter_y2, max_box, max_box_x1, max_box_y1,
max_box_x2, max_box_y2, nram_save, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
max_area, input_num_boxes, algo);
#else
scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
inter_y1, inter_x2, inter_y2, max_box, max_box[1], max_box[2],
max_box[3], max_box[4], nram_save, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
max_area, input_num_boxes, algo);
#endif
} // for max_output_size
}
__mlu_global__ void MLUUionXKernelNMS(
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int max_output_size,
const float iou_threshold, const float confidence_threshold,
const float offset, const cnrtDataType_t data_type_input,
const int output_mode, const int algo, void *workspace, void *result_num,
void *output) {
int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
int32_t *exit_flag = (int32_t *)((char *)workspace +
INFO_NUM * input_num_boxes * input_dwidth);
char *cdma_addr = (char *)exit_flag + sizeof(int32_t);
int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth;
  int available_sram_size = SIZE_SRAM_BUF - reduce_sram_size;
int cluster_score_size = input_num_boxes * input_dwidth;
int cluster_boxes_size = input_num_boxes * 4 * input_dwidth;
char *sram_score = (char *)sram_buffer + reduce_sram_size;
char *sram_boxes =
(char *)sram_buffer + reduce_sram_size + cluster_score_size;
Addr input_ram = GDRAM;
  if ((cluster_score_size + cluster_boxes_size) < available_sram_size) {
input_ram = SRAM;
__memcpy(sram_score, input_confidence, cluster_score_size, GDRAM2SRAM);
__memcpy(sram_boxes, input_boxes, cluster_boxes_size, GDRAM2SRAM);
} else {
__memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM);
}
__sync_cluster();
uint32_t output_box_num = 0;
float *score_data;
float *boxes_data;
score_data = (input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data = (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
if (output_mode == 0) {
if (data_type_input == CNRT_FLOAT32) {
nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
score_data, boxes_data, input_ram, input_num_boxes,
max_output_size, iou_threshold, confidence_threshold,
offset, output_mode, algo, cdma_addr);
} else {
nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
(half *)score_data, (half *)boxes_data, input_ram,
input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo,
cdma_addr);
}
} else {
if (data_type_input == CNRT_FLOAT32) {
nms_detection_ux(exit_flag, output_box_num, (float *)output, score_data,
boxes_data, input_ram, input_num_boxes, max_output_size,
iou_threshold, confidence_threshold, offset, output_mode,
algo, cdma_addr);
} else {
nms_detection_ux(exit_flag, output_box_num, (half *)output,
(half *)score_data, (half *)boxes_data, input_ram,
input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo,
cdma_addr);
}
}
((uint32_t *)result_num)[0] = output_box_num;
}
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_ptr,
const void *scores_ptr, const int input_num_boxes,
const int max_output_boxes, const float iou_threshold,
const float offset, void *workspace_ptr, void *output_size_ptr,
void *output_ptr) {
switch (k_type) {
default: { return; }
case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_UNION1: {
MLUUnion1KernelNMS<<<k_dim, k_type, queue>>>(
(void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0,
/*output_mode=*/0, workspace_ptr, output_size_ptr, output_ptr,
data_type_input, offset, /*algo=*/1);
}; break;
case CNRT_FUNC_TYPE_UNION2:
case CNRT_FUNC_TYPE_UNION4:
case CNRT_FUNC_TYPE_UNION8:
case CNRT_FUNC_TYPE_UNION16: {
MLUUionXKernelNMS<<<k_dim, k_type, queue>>>(
(void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, offset,
data_type_input, /*output_mode=*/0, /*algo=*/1, workspace_ptr,
output_size_ptr, output_ptr);
}; break;
}
}
mmcv/ops/csrc/common/mlu/nms_utils.hpp
deleted
100644 → 0
View file @
6f674c7e
/*************************************************************************
* Copyright (C) [2019-2022] by Cambricon, Inc.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef NMS_UTILS_HPP_
#define NMS_UTILS_HPP_
#include "common_mlu_helper.hpp"
#define NMS_SIZE (64)
#define NMS_UP(x, y) (x / y + (int)(x % y > 0)) * y
#define NMS_DOWN(x, y) (x / y) * y
#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score
#define MEMORY_CORE (0x80)
#define REDUCE_NUM \
(7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
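/* Worked example of the rounding macros above: with x = 100 boxes and
 * y = NMS_SIZE = 64, NMS_UP(100, 64) = (1 + 1) * 64 = 128 (pad up to whole
 * segments) and NMS_DOWN(100, 64) = 1 * 64 = 64 (clamp down to whole
 * segments). The compute-parameter helpers below use them to size segments
 * and pad remainders. */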
__mlu_func__
void
pvLock
()
{
#if __BANG_ARCH__ == 270
if
(
coreId
!=
MEMORY_CORE
)
{
__bang_lock
(
0
,
0
);
}
#endif
}
__mlu_func__
void
pvUnlock
()
{
#if __BANG_ARCH__ == 270
if
(
coreId
!=
MEMORY_CORE
)
{
__bang_unlock
(
0
,
0
);
}
#endif
}
template
<
typename
T
>
static
__mlu_func__
void
computeReluN
(
T
*
nram_dst
,
T
*
nram_src
,
void
*
nram_tmp
,
const
int
deal_num
,
const
T
threshold
=
0
)
{
if
(
threshold
<
0
)
{
return
;
}
if
(
threshold
)
{
#if __BANG_ARCH__ >= 300
__bang_relun
(
nram_dst
,
nram_src
,
deal_num
,
threshold
);
#else
int
align_num
=
NFU_ALIGN_SIZE
/
sizeof
(
T
);
T
*
nram_aux_a
=
(
T
*
)
nram_tmp
;
T
*
nram_aux_b
=
nram_aux_a
+
deal_num
;
T
*
nram_zero
=
nram_aux_b
+
align_num
;
__bang_write_value
(
nram_aux_b
,
align_num
,
threshold
);
__bang_write_zero
(
nram_zero
,
align_num
);
__bang_cycle_lt
((
T
*
)
nram_aux_a
,
nram_src
,
(
T
*
)
nram_aux_b
,
deal_num
,
align_num
);
__bang_mul
(
nram_dst
,
nram_src
,
(
T
*
)
nram_aux_a
,
deal_num
);
__bang_cycle_eq
((
T
*
)
nram_aux_a
,
(
T
*
)
nram_aux_a
,
(
T
*
)
nram_zero
,
deal_num
,
align_num
);
__bang_cycle_mul
((
T
*
)
nram_aux_a
,
(
T
*
)
nram_aux_a
,
(
T
*
)
nram_aux_b
,
deal_num
,
align_num
);
__bang_add
(
nram_dst
,
nram_dst
,
(
T
*
)
nram_aux_a
,
deal_num
);
__bang_cycle_gt
((
T
*
)
nram_aux_a
,
nram_dst
,
(
T
*
)
nram_zero
,
deal_num
,
align_num
);
__bang_mul
(
nram_dst
,
nram_dst
,
(
T
*
)
nram_aux_a
,
deal_num
);
#endif
}
else
{
#if __BANG_ARCH__ >= 300
__bang_relu
(
nram_dst
,
nram_src
,
deal_num
);
#else
__bang_active_relu
(
nram_dst
,
nram_src
,
deal_num
);
#endif
}
}
__mlu_func__
void
getComputeParamsBlockOrU1
(
const
int
input_dwidth
,
const
int
input_box_num
,
const
int
limit
,
const
int
core_limit
,
int
&
input_offset
,
int
&
max_seg_pad
,
int
&
repeat
,
int
&
remain
,
int
&
remain_pad
,
int
&
max_seg_iou_compute
,
int
&
repeat_iou_compute
,
int
&
remain_iou_compute
,
int
&
remain_pad_iou_compute
)
{
int
avg_core
=
input_box_num
/
core_limit
;
int
rem
=
input_box_num
%
core_limit
;
int
len_core
=
avg_core
+
(
coreId
<
rem
?
1
:
0
);
input_offset
=
avg_core
*
coreId
+
(
coreId
<=
rem
?
coreId
:
rem
);
max_seg_pad
=
NMS_DOWN
(
limit
,
NMS_SIZE
);
repeat
=
len_core
/
max_seg_pad
;
remain
=
len_core
%
max_seg_pad
;
remain_pad
=
NMS_UP
(
remain
,
NMS_SIZE
);
// if datatype is fp16, we should cvt to fp32 when compute iou
max_seg_iou_compute
=
NMS_DOWN
(
max_seg_pad
/
(
4
/
input_dwidth
),
NMS_SIZE
);
repeat_iou_compute
=
len_core
/
max_seg_iou_compute
;
remain_iou_compute
=
len_core
%
max_seg_iou_compute
;
remain_pad_iou_compute
=
NMS_UP
(
remain_iou_compute
,
NMS_SIZE
);
}
__mlu_func__
void
getComputeParamsUx
(
const
int
input_dwidth
,
const
int
input_num_boxes
,
const
int
limit
,
int
&
input_offset
,
int
&
max_seg_pad
,
int
&
repeat
,
int
&
remain
,
int
&
remain_pad
,
int
&
max_seg_iou_compute
,
int
&
repeat_iou_compute
,
int
&
remain_iou_compute
,
int
&
remain_pad_iou_compute
)
{
// data split
int
avg_cluster
=
input_num_boxes
/
clusterDim
;
int
rem_cluster
=
input_num_boxes
%
clusterDim
;
int
len_cluster
=
avg_cluster
+
(
clusterId
<
rem_cluster
);
int
cluster_offset
=
avg_cluster
*
clusterId
+
(
clusterId
<=
rem_cluster
?
clusterId
:
rem_cluster
);
int
avg_core
=
len_cluster
/
coreDim
;
int
rem_core
=
len_cluster
%
coreDim
;
int
len_core
=
avg_core
+
(
coreId
<
rem_core
);
int
core_offset
=
avg_core
*
coreId
+
(
coreId
<=
rem_core
?
coreId
:
rem_core
);
input_offset
=
cluster_offset
+
core_offset
;
max_seg_pad
=
NMS_DOWN
(
limit
,
NMS_SIZE
);
// core 0 of each cluster calculate the max score index
int
max_index_len_core
=
avg_cluster
+
(
clusterId
<
rem_cluster
);
repeat
=
max_index_len_core
/
max_seg_pad
;
remain
=
max_index_len_core
%
max_seg_pad
;
remain_pad
=
NMS_UP
(
remain
,
NMS_SIZE
);
// if datatype is fp16, we should cvt to fp32 when compute iou
max_seg_iou_compute
=
NMS_DOWN
(
max_seg_pad
/
(
sizeof
(
float
)
/
input_dwidth
),
NMS_SIZE
);
repeat_iou_compute
=
len_core
/
max_seg_iou_compute
;
remain_iou_compute
=
len_core
%
max_seg_iou_compute
;
remain_pad_iou_compute
=
NMS_UP
(
remain_iou_compute
,
NMS_SIZE
);
}
template
<
typename
IN_DT
>
__mlu_func__
void
findGlobalMaxBox
(
IN_DT
*
max_box
,
IN_DT
*
sram
,
IN_DT
*
inter_x1
)
{
// copy all partial max to the sram of cluster 0
if
(
clusterId
!=
0
)
{
__memcpy
(
sram
+
REDUCE_NUM
*
clusterId
,
sram
,
REDUCE_NUM
*
sizeof
(
IN_DT
),
SRAM2SRAM
,
0
);
}
__sync_all
();
// reduce between clusters to get the global max box
if
(
clusterId
==
0
)
{
if
(
coreId
==
0
)
{
__bang_write_zero
(
inter_x1
,
NMS_SIZE
);
__memcpy
(
inter_x1
,
sram
,
sizeof
(
IN_DT
),
SRAM2NRAM
,
sizeof
(
IN_DT
),
REDUCE_NUM
*
sizeof
(
IN_DT
),
clusterDim
-
1
);
__bang_max
(
max_box
,
inter_x1
,
NMS_SIZE
);
int
max_cluster
=
(
sizeof
(
IN_DT
)
==
sizeof
(
half
))
?
((
uint16_t
*
)
max_box
)[
1
]
:
((
uint32_t
*
)
max_box
)[
1
];
__memcpy
(
max_box
,
sram
+
max_cluster
*
REDUCE_NUM
,
REDUCE_NUM
*
sizeof
(
IN_DT
),
SRAM2NRAM
);
__memcpy
(
sram
,
max_box
,
REDUCE_NUM
*
sizeof
(
IN_DT
),
NRAM2SRAM
);
}
__sync_cluster
();
if
(
coreId
==
0x80
&&
clusterDim
>
1
)
{
// broadcast global max box to each cluster's sram
for
(
int
cluster_idx
=
1
;
cluster_idx
<
clusterDim
;
++
cluster_idx
)
{
__memcpy
(
sram
,
sram
,
REDUCE_NUM
*
sizeof
(
IN_DT
),
SRAM2SRAM
,
cluster_idx
);
}
}
__sync_cluster
();
}
__sync_all
();
// copy the global max box to max_box
__memcpy
(
max_box
,
sram
,
REDUCE_NUM
*
sizeof
(
IN_DT
),
SRAM2NRAM
);
}
template
<
typename
IN_DT
>
__mlu_func__
void
findCoreMaxBox
(
IN_DT
*
input_score_ptr
,
IN_DT
*
score
,
IN_DT
*
inter_x1
,
IN_DT
*
max_box
,
const
IN_DT
*
input_x1_ptr
,
const
IN_DT
*
input_y1_ptr
,
const
IN_DT
*
input_x2_ptr
,
const
IN_DT
*
input_y2_ptr
,
const
mluMemcpyDirection_t
load_dir
,
const
int
input_offset
,
const
int
repeat
,
const
int
remain
,
const
int
remain_pad
,
const
int
max_seg_pad
,
int
&
max_index
)
{
if
(
coreId
!=
0x80
)
{
for
(
int
i
=
0
;
i
<=
repeat
;
i
++
)
{
if
(
i
==
repeat
&&
remain
==
0
)
{
break
;
}
int
seg_len
=
0
;
// the length every nms compute
int
cpy_len
=
0
;
// the length every nms memcpy
i
==
repeat
?
seg_len
=
remain_pad
:
seg_len
=
max_seg_pad
;
i
==
repeat
?
cpy_len
=
remain
:
cpy_len
=
max_seg_pad
;
/******NMS LOAD START******/
__bang_write_zero
(
score
,
seg_len
);
__memcpy
(
score
,
input_score_ptr
+
input_offset
+
i
*
max_seg_pad
,
cpy_len
*
sizeof
(
IN_DT
),
load_dir
,
cpy_len
*
sizeof
(
IN_DT
),
cpy_len
*
sizeof
(
IN_DT
),
0
);
/******NMS LOAD END******/
__bang_max
(
inter_x1
,
score
,
seg_len
);
if
(
inter_x1
[
0
]
>
max_box
[
0
])
{
max_box
[
0
]
=
inter_x1
[
0
];
if
(
sizeof
(
IN_DT
)
==
sizeof
(
half
))
{
max_index
=
((
uint16_t
*
)
inter_x1
)[
1
]
+
input_offset
+
i
*
max_seg_pad
;
// offset start from head of input_data
}
else
if
(
sizeof
(
IN_DT
)
==
sizeof
(
float
))
{
max_index
=
((
uint32_t
*
)
inter_x1
)[
1
]
+
input_offset
+
i
*
max_seg_pad
;
// offset start from head of input_data
}
}
}
// for repeat
// the max box's x1, y1, x2, y2 on every core
max_box
[
1
]
=
input_x1_ptr
[
max_index
];
max_box
[
2
]
=
input_y1_ptr
[
max_index
];
max_box
[
3
]
=
input_x2_ptr
[
max_index
];
max_box
[
4
]
=
input_y2_ptr
[
max_index
];
((
uint32_t
*
)(
max_box
+
5
))[
0
]
=
max_index
;
}
}
template
<
typename
IN_DT
>
__mlu_func__
void
findClusterMaxBox
(
IN_DT
*
sram
,
IN_DT
*
max_box
,
IN_DT
*
inter_x1
,
IN_DT
*
input_data_score
,
const
int
core_limit
)
{
// find the max with sram
// copy every core's box info to sram, form: score---x1---y1---x2---y2---
__memcpy
(
sram
+
REDUCE_NUM
*
coreId
,
max_box
,
REDUCE_NUM
*
sizeof
(
IN_DT
),
NRAM2SRAM
);
// int32_t datatype
__sync_cluster
();
// copy score from sram to nram and find the max
__bang_write_zero
(
inter_x1
,
64
);
__memcpy
(
inter_x1
,
sram
,
sizeof
(
IN_DT
),
SRAM2NRAM
,
sizeof
(
IN_DT
),
REDUCE_NUM
*
sizeof
(
IN_DT
),
coreDim
-
1
);
__bang_max
(
max_box
,
inter_x1
,
64
);
int
max_core
=
sizeof
(
IN_DT
)
==
sizeof
(
half
)
?
((
uint16_t
*
)
max_box
)[
1
]
:
((
uint32_t
*
)
max_box
)[
1
];
// copy the max box to max_box
__memcpy
(
max_box
,
sram
+
max_core
*
REDUCE_NUM
,
REDUCE_NUM
*
sizeof
(
IN_DT
),
SRAM2NRAM
);
}
/*****************************************************************************/
/*******************************CALCULATE MAX AREA****************************/
/*****************************************************************************/
template
<
typename
IN_DT
>
__mlu_func__
void
calMaxArea
(
IN_DT
*
max_box
,
const
int
algo
,
float
offset
,
float
&
max_area
)
{
if
(
algo
==
0
||
offset
==
0.0
)
{
max_area
=
((
float
)
max_box
[
3
]
-
(
float
)
max_box
[
1
])
*
((
float
)
max_box
[
4
]
-
(
float
)
max_box
[
2
]);
}
else
{
max_area
=
((
float
)
max_box
[
3
]
-
(
float
)
max_box
[
1
]
+
offset
)
*
((
float
)
max_box
[
4
]
-
(
float
)
max_box
[
2
]
+
offset
);
}
}
template <typename IN_DT>
__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
                             float &max_area, float &max_box_x1,
                             float &max_box_y1, float &max_box_x2,
                             float &max_box_y2) {
  // the case of random inf will break the requirement of x1 <= x2, y1 <= y2,
  // so exchange them if it happens.
  max_box_x1 = float(max_box[1]);
  max_box_x2 = float(max_box[3]);
  if (max_box[1] > max_box[3]) {
    max_box_x1 = float(max_box[3]);
    max_box_x2 = float(max_box[1]);
  }
  max_box_y1 = float(max_box[2]);
  max_box_y2 = float(max_box[4]);
  if (max_box[2] > max_box[4]) {
    max_box_y1 = float(max_box[4]);
    max_box_y2 = float(max_box[2]);
  }
  if (algo == 0 || offset == 0.0) {
    max_area = (max_box_x2 - max_box_x1) * (max_box_y2 - max_box_y1);
  } else {
    max_area = (max_box_x2 - max_box_x1 + offset) *
               (max_box_y2 - max_box_y1 + offset);
  }
}
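
A quick numeric check of the two area conventions handled above: with algo == 1 and a non-zero offset the box is treated as pixel-inclusive, so a box spanning [0, 4] x [0, 2] with offset 1 has area 5 * 3 rather than 4 * 2. This is a standalone sketch, not kernel code.

#include <cstdio>

int main() {
  const float x1 = 0, y1 = 0, x2 = 4, y2 = 2, offset = 1.0f;
  std::printf("algo 0 area: %g\n", (x2 - x1) * (y2 - y1));  // 8
  std::printf("algo 1 area: %g\n",
              (x2 - x1 + offset) * (y2 - y1 + offset));     // 15
  return 0;
}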
/***********************************************************************/
/*******************************STORE RESULT****************************/
/***********************************************************************/
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void storeResult(IN_DT *max_box, OUT_DT *nram_save,
                              OUT_DT *&output_dram, const int keep,
                              const int nram_save_limit_count,
                              const int max_output_size,
                              const float thresh_score, const int output_mode,
                              int &nram_save_count, uint32_t &output_box_num) {
  /******NMS STORE START******/
  // store to nram
  if (float(max_box[0]) > thresh_score) {
    OUT_DT *save_ptr;
    int save_offset = 0;
    int save_str_num = 0;
    save_ptr = nram_save;
    save_offset = nram_save_count;
    save_str_num = nram_save_limit_count;
    if (clusterId == 0 && coreId == 0) {
      if (output_mode == 0) {  // index1, index2, ...
        save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0];
      } else if (output_mode == 1) {  // score, x1, y1, x2, y2
        __memcpy(save_ptr + save_offset * INFO_NUM, max_box,
                 INFO_NUM * sizeof(IN_DT), NRAM2NRAM, INFO_NUM * sizeof(IN_DT),
                 INFO_NUM * sizeof(IN_DT), 0);
      } else if (output_mode == 2) {  // score---, x1---, y1---, x2---, y2---
        __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), NRAM2NRAM,
                 save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), 4);
      }
    }
    nram_save_count++;
    output_box_num++;
  }

  // store to sram/gdram
  if (output_box_num != 0) {
    if ((nram_save_count == nram_save_limit_count) ||
        (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) {
      if (nram_save_count != 0) {
        if (clusterId == 0 && coreId == 0) {
          if (output_mode == 0) {  // index1, index2, ...
            pvLock();
            __memcpy(output_dram, nram_save,
                     nram_save_count * sizeof(uint32_t), NRAM2GDRAM);
            pvUnlock();
            output_dram += nram_save_count;
          } else if (output_mode == 1) {  // score, x1, y1, x2, y2
            pvLock();
            __memcpy(output_dram, nram_save,
                     nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
            pvUnlock();
            output_dram += nram_save_count * INFO_NUM;
          } else if (output_mode == 2) {
            // score---, x1---, y1---, x2---, y2---
            pvLock();
            __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
                     NRAM2GDRAM, max_output_size * sizeof(IN_DT),
                     nram_save_limit_count * sizeof(IN_DT), 4);
            pvUnlock();
            output_dram += nram_save_count;
          }
          nram_save_count = 0;
        }
      }
    }  // if move data nram->sram/gdram
  }    // if dst
}
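
The three output modes write different layouts: mode 0 stores bare indices, mode 1 interleaves score, x1, y1, x2, y2 per box (array of structs), and mode 2 stores five planes of stride nram_save_limit_count (struct of arrays). A standalone sketch of the mode 2 layout, with illustrative values only:

#include <cstdio>

int main() {
  const int limit = 3;                          // nram_save_limit_count
  float nram_save[5 * limit] = {};              // score---, x1---, ..., y2---
  const float max_box[5] = {0.9f, 1, 2, 3, 4};  // score, x1, y1, x2, y2
  const int save_offset = 0;                    // slot for this box
  for (int plane = 0; plane < 5; ++plane)       // one strided write per plane
    nram_save[plane * limit + save_offset] = max_box[plane];
  std::printf("x1 plane begins at index %d, value %g\n", limit,
              nram_save[limit]);
  return 0;
}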
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void scoreUpdate(
    IN_DT *input_score_ptr, const mluMemcpyDirection_t load_dir,
    const mluMemcpyDirection_t store_dir, const IN_DT *input_x1_ptr,
    const IN_DT *input_y1_ptr, const IN_DT *input_x2_ptr,
    const IN_DT *input_y2_ptr, IN_DT *x1, IN_DT *y1, IN_DT *x2, IN_DT *y2,
    IN_DT *score, IN_DT *inter_x1, IN_DT *inter_y1, IN_DT *inter_x2,
    IN_DT *inter_y2, IN_DT *max_box, const float max_box_x1,
    const float max_box_y1, const float max_box_x2, const float max_box_y2,
    OUT_DT *nram_save, int repeat_iou_compute, int remain_iou_compute,
    int remain_pad_iou_compute, int max_seg_iou_compute, int max_seg_pad,
    const float thresh_iou, const float div_thresh_iou,
    const int input_offset, const float offset, const float max_area,
    const int input_num_boxes, const int algo) {
  for (int i = 0; i <= repeat_iou_compute; i++) {
    if (i == repeat_iou_compute && remain_iou_compute == 0) {
      break;
    }
    int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute
                                            : max_seg_iou_compute;
    int cpy_len =
        (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute;
    /******NMS LOAD START******/
    int dt_offset = 0;
    if (sizeof(IN_DT) == sizeof(float)) {
      __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
               cpy_len * sizeof(IN_DT), 0);
      dt_offset = 0;
    } else if (sizeof(IN_DT) == sizeof(half)) {
      __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
               cpy_len * sizeof(IN_DT), 0);
      __bang_half2float((float *)score, (half *)x1, seg_len);
      dt_offset = max_seg_iou_compute;
    }
#if __BANG_ARCH__ >= 300
    __memcpy(inter_x1 + dt_offset,
             input_x1_ptr + input_offset + i * max_seg_iou_compute,
             cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
             input_num_boxes * sizeof(IN_DT), 3);
    if (sizeof(IN_DT) == sizeof(half)) {
      __bang_half2float((float *)inter_x1,
                        (half *)inter_x1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)inter_y1,
                        (half *)inter_y1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)inter_x2,
                        (half *)inter_x2 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)inter_y2,
                        (half *)inter_y2 + max_seg_iou_compute, seg_len);
    }
    // box transfer
    __bang_minequal((float *)x1, (float *)inter_x1, (float *)inter_x2,
                    seg_len);
    __bang_maxequal((float *)x2, (float *)inter_x1, (float *)inter_x2,
                    seg_len);
    __bang_minequal((float *)y1, (float *)inter_y1, (float *)inter_y2,
                    seg_len);
    __bang_maxequal((float *)y2, (float *)inter_y1, (float *)inter_y2,
                    seg_len);
    // 1. compute IOU
    // get the area_I
    __bang_maxeq_scalar((float *)inter_x1, (float *)x1, max_box_x1,
                        seg_len);  // inter_x1
    __bang_mineq_scalar((float *)inter_x2, (float *)x2, max_box_x2,
                        seg_len);  // inter_x2
    __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
    }
    computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
                 seg_len);  // inter_w
    __bang_maxeq_scalar((float *)inter_y1, (float *)y1, float(max_box_y1),
                        seg_len);  // inter_y1
    __bang_mineq_scalar((float *)inter_y2, (float *)y2, float(max_box_y2),
                        seg_len);  // inter_y2
    __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
    }
    computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
                 seg_len);  // inter_h
    __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
               seg_len);  // area_I
    // get the area of input_box: area = (x2 - x1) * (y2 - y1);
    if (algo == 1 && offset != 0.0) {
      __bang_fusion(FUSION_FSA, (float *)inter_y1, (float *)x2, (float *)x1,
                    offset, seg_len, seg_len);
      __bang_fusion(FUSION_FSA, (float *)inter_y2, (float *)y2, (float *)y1,
                    offset, seg_len, seg_len);
      __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
                 seg_len);  // area
    } else {
      __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
      __bang_fusion(FUSION_FSM, (float *)inter_x2, (float *)y2, (float *)y1,
                    (float *)inter_y1, seg_len, seg_len);
    }
    // get the area_U: area + max_area - area_I
    __bang_fusion(FUSION_FAS, (float *)inter_x2, (float *)inter_x2, max_area,
                  (float *)inter_x1, seg_len, seg_len);
    // 2. select the box
    // if IOU greater than thresh, set the score to zero, abort it: area_U >
    // area_I * (1 / thresh)?
    if (thresh_iou > 0.0) {
      __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
                        seg_len);
    } else {
      __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
                        seg_len);
    }
    // process for nan
    __bang_lt((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
              seg_len);
    __bang_not((float *)inter_x1, (float *)inter_x1, seg_len);
    __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
    /******NMS COMPUTE END******/
#else
    __memcpy(x1 + dt_offset,
             input_x1_ptr + input_offset + i * max_seg_iou_compute,
             cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
             input_num_boxes * sizeof(IN_DT), 3);
    if (sizeof(IN_DT) == sizeof(half)) {
      __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
                        seg_len);
      __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
                        seg_len);
      __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
                        seg_len);
      __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
                        seg_len);
    }
    // 1. compute IOU
    // get the area_I
    __bang_write_value((float *)inter_y1, seg_len,
                       float(max_box[1]));  // max_x1
    __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
                    seg_len);  // inter_x1
    __bang_write_value((float *)inter_y2, seg_len,
                       float(max_box[3]));  // max_x2
    __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
                    seg_len);  // inter_x2
    __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
    }
    computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
                 seg_len);  // inter_w
    __bang_write_value((float *)inter_x2, seg_len,
                       float(max_box[2]));  // max_y1
    __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
                    seg_len);  // inter_y1
    __bang_write_value((float *)inter_x2, seg_len,
                       float(max_box[4]));  // max_y2
    __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
                    seg_len);  // inter_y2
    __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
    }
    computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
                 seg_len);  // inter_h
    __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
               seg_len);  // area_I
    // get the area of input_box: area = (x2 - x1) * (y2 - y1);
    __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
    __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
      __bang_add_scalar((float *)inter_y2, (float *)inter_y2, offset, seg_len);
    }
    __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
               seg_len);  // area
    // get the area_U: area + max_area - area_I
    __bang_add_scalar((float *)inter_x2, (float *)inter_x2, float(max_area),
                      seg_len);
    __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
               seg_len);  // area_U
    // 2. select the box
    // if IOU greater than thresh, set the score to zero, abort it: area_U >
    // area_I * (1 / thresh)?
    if (thresh_iou > 0.0) {
      __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
                        seg_len);
    } else {
      __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
                        seg_len);
    }
    __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
              seg_len);
    __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
    /******NMS COMPUTE END******/
#endif
    // update the score
    if (sizeof(IN_DT) == sizeof(half)) {
      convertFloat2half((half *)score, (float *)score, seg_len);
    }
    pvLock();
    __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
             cpy_len * sizeof(IN_DT), store_dir, cpy_len * sizeof(IN_DT),
             cpy_len * sizeof(IN_DT), 0);
    pvUnlock();
  }
}
#endif // NMS_UTILS_HPP_
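
The selection step in scoreUpdate never divides: instead of testing area_I / area_U > thresh, it scales area_I by div_thresh_iou (= 1 / thresh) and compares against area_U. A minimal scalar sketch of the same test, with illustrative names; this is not kernel code:

#include <algorithm>
#include <cstdio>

struct Box { float x1, y1, x2, y2; };

static float intersectionArea(const Box &a, const Box &b) {
  float w = std::max(0.0f, std::min(a.x2, b.x2) - std::max(a.x1, b.x1));
  float h = std::max(0.0f, std::min(a.y2, b.y2) - std::max(a.y1, b.y1));
  return w * h;
}

static bool survivesNms(const Box &box, const Box &max_box, float thresh) {
  float area_i = intersectionArea(box, max_box);
  float area_box = (box.x2 - box.x1) * (box.y2 - box.y1);
  float area_max = (max_box.x2 - max_box.x1) * (max_box.y2 - max_box.y1);
  float area_u = area_box + area_max - area_i;
  // Same predicate as the kernel: one multiply by 1/thresh instead of a
  // divide, so the box is kept iff IoU <= thresh.
  return area_u >= area_i * (1.0f / thresh);
}

int main() {
  Box max_box{0, 0, 10, 10};
  Box candidate{5, 5, 15, 15};  // IoU = 25 / 175, about 0.14
  std::printf("kept: %d\n", survivesNms(candidate, max_box, 0.5f));
  return 0;
}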
mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include "psamask_utils.hpp"
#define COMPUTE_COUNT_ALIGN 64
__nram__ char buf[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void swap(T &a, T &b) {
T tmp = a;
a = b;
b = tmp;
}
template <typename T>
__mlu_func__ void storeDataFromNramToDram(T *dst, const T *src,
const PositionInCore &position,
const Shape &shape_full) {
int n_offset = shape_full.h * shape_full.w * shape_full.c;
int h_offset = shape_full.w * shape_full.c;
int w_offset = shape_full.c;
int n_seg = position.n_end - position.n_start;
int h_seg = position.h_end - position.h_start;
int w_seg = position.w_end - position.w_start;
int size = h_seg * w_seg * shape_full.c;
__memcpy(dst + position.n_start * n_offset + position.h_start * h_offset +
position.w_start * w_offset,
src, size * sizeof(T), NRAM2GDRAM, n_offset * sizeof(T),
size * sizeof(T), n_seg - 1);
}
template <typename T>
__mlu_func__ void loadDataFromDramToNram(T *dst, const T *src,
const PositionInCore &position,
const Shape &shape_full) {
int n_offset = shape_full.h * shape_full.w * shape_full.c;
int h_offset = shape_full.w * shape_full.c;
int w_offset = shape_full.c;
int n_seg = position.n_end - position.n_start;
int h_seg = position.h_end - position.h_start;
int w_seg = position.w_end - position.w_start;
int size = h_seg * w_seg * shape_full.c;
__memcpy(dst, src + position.n_start * n_offset +
position.h_start * h_offset + position.w_start * w_offset,
size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T),
n_seg - 1);
}
// transpose the data from A*B*C*(D*E) to A*D*E*(B*C)
template <typename T>
__mlu_func__ void transposeData(T *dst, T *src, const Shape &shape_seg) {
int align_c = CEIL_ALIGN(shape_seg.c, COMPUTE_COUNT_ALIGN / sizeof(T));
int align_hw =
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
for (int i = 0; i < shape_seg.n; ++i) {
__bang_transpose(dst, src, align_hw, align_c);
dst += align_hw * align_c;
src += align_hw * align_c;
}
}
template <typename T>
__mlu_func__ void psamaskCollectForward(
const T *x_dram, T *y_dram, const PositionInCore &position,
const Shape &x_full, const Shape &y_full, const Shape &shape_seg,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
T *x_nram = (T *)buf;
T *y_nram =
x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c,
COMPUTE_COUNT_ALIGN / sizeof(T));
loadDataFromDramToNram(x_nram, x_dram, position, x_full);
// fill zeros to output
int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c,
NFU_ALIGN_SIZE / sizeof(T));
__bang_write_value(y_nram, elem_count, (T)0);
int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c;
int y_h_offset = shape_seg.w * shape_seg.c;
int y_w_offset = shape_seg.c;
int x_n_offset = shape_seg.h * shape_seg.w * x_full.c;
int y_c_offset = 1;
int x_h_offset = shape_seg.w * x_full.c;
int x_w_offset = x_full.c;
int x_c_offset = 1;
int x_start = 0;
int y_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
for (int widx = 0; widx < shape_seg.w; ++widx) {
int h_abs = hidx + position.h_start;
int w_abs = widx + position.w_start;
int y_offset = y_start;
int x_offset = x_start;
y_offset += hidx * y_h_offset + widx * y_w_offset;
x_offset += hidx * x_h_offset + widx * x_w_offset;
const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
const int hend = x_full.h + half_h_mask - h_abs < h_mask
? x_full.h + half_h_mask - h_abs
: h_mask;
const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
const int wend = x_full.w + half_w_mask - w_abs < w_mask
? x_full.w + half_w_mask - w_abs
: w_mask;
// (h, w) is mask-indexed;
// (h + h_abs - half_h_mask, w + w_abs - half_w_mask) is feature-indexed
y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart +
w_abs - half_w_mask) *
y_c_offset;
x_offset += (hstart * w_mask + wstart) * x_c_offset;
int count = wend - wstart;
__memcpy(y_nram + y_offset, x_nram + x_offset, count * sizeof(T),
NRAM2NRAM, y_c_offset * x_full.w * sizeof(T),
x_c_offset * w_mask * sizeof(T), hend - hstart - 1);
}
}
y_start += y_n_offset;
x_start += x_n_offset;
}
storeDataFromNramToDram(y_dram, y_nram, position, y_full);
}
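
The hstart/hend/wstart/wend clamps above keep the mask window inside the feature map: mask row h maps to feature row h + h_abs - half_h_mask, which must stay in [0, x_full.h). A scalar illustration with made-up sizes (standalone, not kernel code):

#include <algorithm>
#include <cstdio>

int main() {
  const int h_feature = 8, h_mask = 5, half_h_mask = 2;
  for (int h_abs = 0; h_abs < h_feature; ++h_abs) {
    int hstart = std::max(0, half_h_mask - h_abs);
    int hend = std::min(h_mask, h_feature + half_h_mask - h_abs);
    std::printf("h_abs=%d: mask rows [%d, %d) -> feature rows [%d, %d)\n",
                h_abs, hstart, hend, hstart + h_abs - half_h_mask,
                hend + h_abs - half_h_mask);
  }
  return 0;
}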
template <typename T>
__mlu_func__ void psamaskDistributeForward(
const T *x_dram, T *y_dram, const PositionInCore &position,
const Shape &x_full, const Shape &y_full, const Shape &shape_seg,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
T *x_nram = (T *)buf;
T *y_nram_temp =
x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c,
COMPUTE_COUNT_ALIGN / sizeof(T));
loadDataFromDramToNram(x_nram, x_dram, position, x_full);
// fill zeros to output
int align_c = CEIL_ALIGN(y_full.c, COMPUTE_COUNT_ALIGN / sizeof(T));
int align_hw =
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
int elem_count =
CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T));
__bang_write_value(y_nram_temp, elem_count, (T)0);
int y_n_offset = align_hw * align_c;
int y_h_offset = shape_seg.w * align_c;
int y_w_offset = align_c;
int y_c_offset = 1;
int x_n_offset = shape_seg.h * shape_seg.w * x_full.c;
int x_h_offset = shape_seg.w * x_full.c;
int x_w_offset = x_full.c;
int x_c_offset = 1;
int h_feature = y_full.h;
int w_feature = y_full.w;
int y_start = 0;
int x_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
for (int widx = 0; widx < shape_seg.w; ++widx) {
int h_abs = hidx + position.h_start;
int w_abs = widx + position.w_start;
int y_offset = y_start;
int x_offset = x_start;
y_offset += hidx * y_h_offset + widx * y_w_offset;
x_offset += hidx * x_h_offset + widx * x_w_offset;
const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
const int hend = h_feature + half_h_mask - h_abs < h_mask
? h_feature + half_h_mask - h_abs
: h_mask;
const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
const int wend = w_feature + half_w_mask - w_abs < w_mask
? w_feature + half_w_mask - w_abs
: w_mask;
// (h, w) is mask-indexed;
// (h + h_abs - half_h_mask, w + w_abs - half_w_mask) is feature-indexed
y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart +
w_abs - half_w_mask) *
y_c_offset;
x_offset += (hstart * w_mask + wstart) * x_c_offset;
int count = wend - wstart;
__memcpy(y_nram_temp + y_offset, x_nram + x_offset, count * sizeof(T),
NRAM2NRAM, y_c_offset * w_feature * sizeof(T),
x_c_offset * w_mask * sizeof(T), hend - hstart - 1);
}
}
y_start += y_n_offset;
x_start += x_n_offset;
}
// transpose y
T *y_nram = y_nram_temp + shape_seg.n * align_hw * align_c;
Shape y_seg{shape_seg.n, shape_seg.h, shape_seg.w, y_full.c};
transposeData(y_nram, y_nram_temp, y_seg);
swap(align_c, align_hw);
// store y from nram to dram
int y_n_offset_full = y_full.h * y_full.w * y_full.c;
int y_w_offset_full = y_full.c;
int y_c_offset_full = 1;
int y_dram_start =
position.n_start * y_n_offset_full +
(position.h_start * y_full.w + position.w_start) * y_c_offset_full;
int y_nram_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
int y_dram_offset = y_dram_start + nidx * y_n_offset_full;
int y_nram_offset = y_nram_start + nidx * align_hw * align_c;
__memcpy(y_dram + y_dram_offset, y_nram + y_nram_offset,
shape_seg.h * shape_seg.w * sizeof(T), NRAM2GDRAM,
y_w_offset_full * sizeof(T), align_c * sizeof(T),
h_feature * w_feature - 1);
}
}
template <typename T>
__mlu_func__ void psamaskCollectBackward(
const T *dy_dram, T *dx_dram, const PositionInCore &position,
const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
T *dy_nram = (T *)buf;
T *dx_nram =
dy_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * dy_full.c,
COMPUTE_COUNT_ALIGN / sizeof(T));
loadDataFromDramToNram(dy_nram, dy_dram, position, dy_full);
// fill zeros to output
int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c,
NFU_ALIGN_SIZE / sizeof(T));
__bang_write_value(dx_nram, elem_count, (T)0);
int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c;
int dy_h_offset = shape_seg.w * dy_full.c;
int dy_w_offset = dy_full.c;
int dy_c_offset = 1;
int dx_n_offset = shape_seg.h * shape_seg.w * dx_full.c;
int dx_h_offset = shape_seg.w * dx_full.c;
int dx_w_offset = dx_full.c;
int dx_c_offset = 1;
int h_feature = dy_full.h;
int w_feature = dy_full.w;
int dy_start = 0;
int dx_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
for (int widx = 0; widx < shape_seg.w; ++widx) {
int h_abs = hidx + position.h_start;
int w_abs = widx + position.w_start;
int dy_offset = dy_start;
int dx_offset = dx_start;
dy_offset += hidx * dy_h_offset + widx * dy_w_offset;
dx_offset += hidx * dx_h_offset + widx * dx_w_offset;
const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
const int hend = h_feature + half_h_mask - h_abs < h_mask
? h_feature + half_h_mask - h_abs
: h_mask;
const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
const int wend = w_feature + half_w_mask - w_abs < w_mask
? w_feature + half_w_mask - w_abs
: w_mask;
// (h, w) is mask-indexed;
// (h + h_abs - half_h_mask, w + w_abs - half_w_mask) is feature-indexed
dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart +
w_abs - half_w_mask) *
dy_c_offset;
dx_offset += (hstart * w_mask + wstart) * dx_c_offset;
int count = wend - wstart;
__memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T),
NRAM2NRAM, dx_c_offset * w_mask * sizeof(T),
dy_c_offset * w_feature * sizeof(T), hend - hstart - 1);
}
}
dy_start += dy_n_offset;
dx_start += dx_n_offset;
}
storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full);
}
template <typename T>
__mlu_func__ void psamaskDistributeBackward(
const T *dy_dram, T *dx_dram, const PositionInCore &position,
const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
// load dy from dram to nram
T *dy_nram_temp = (T *)buf;
int dy_n_offset_full = dy_full.h * dy_full.w * dy_full.c;
int dy_c_offset_full = 1;
int h_feature = dy_full.h;
int w_feature = dy_full.w;
int align_c =
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
int align_hw =
CEIL_ALIGN(h_feature * w_feature, COMPUTE_COUNT_ALIGN / sizeof(T));
int dy_dram_start =
position.n_start * dy_n_offset_full +
(position.h_start * w_feature + position.w_start) * dy_c_offset_full;
int dy_nram_start = 0;
for (int i = 0; i < shape_seg.n; ++i) {
int dy_nram_offset = dy_nram_start + i * (align_hw * align_c);
int dy_dram_offset = dy_dram_start + i * dy_n_offset_full;
__memcpy(dy_nram_temp + dy_nram_offset, dy_dram + dy_dram_offset,
shape_seg.h * shape_seg.w * sizeof(T), GDRAM2NRAM,
align_c * sizeof(T), dy_full.c * sizeof(T),
h_feature * w_feature - 1);
}
T *dy_nram = dy_nram_temp + shape_seg.n * align_hw * align_c;
Shape dy_seg{shape_seg.n, h_feature, w_feature, shape_seg.h * shape_seg.w};
transposeData(dy_nram, dy_nram_temp, dy_seg);
swap(align_c, align_hw);
// fill zeros to dx
T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c;
int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c;
__bang_write_value(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)),
(T)0);
int dy_n_offset_seg = align_hw * align_c;
int dy_h_offset_seg = shape_seg.w * align_c;
int dy_w_offset_seg = align_c;
int dy_c_offset_seg = 1;
int dx_n_offset_seg = shape_seg.h * shape_seg.w * shape_seg.c;
int dx_h_offset_seg = shape_seg.w * shape_seg.c;
int dx_w_offset_seg = shape_seg.c;
int dx_c_offset_seg = 1;
int dy_start = 0;
int dx_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
for (int widx = 0; widx < shape_seg.w; ++widx) {
int h_abs = hidx + position.h_start;
int w_abs = widx + position.w_start;
int dy_offset = dy_start;
int dx_offset = dx_start;
dy_offset += hidx * dy_h_offset_seg + widx * dy_w_offset_seg;
dx_offset += hidx * dx_h_offset_seg + widx * dx_w_offset_seg;
const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
const int hend = h_feature + half_h_mask - h_abs < h_mask
? h_feature + half_h_mask - h_abs
: h_mask;
const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
const int wend = w_feature + half_w_mask - w_abs < w_mask
? w_feature + half_w_mask - w_abs
: w_mask;
// (h, w) is mask-indexed;
// (h + h_abs - half_h_mask, w + w_abs - half_w_mask) is feature-indexed
dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart +
w_abs - half_w_mask) *
dy_c_offset_seg;
dx_offset += (hstart * w_mask + wstart) * dx_c_offset_seg;
int count = wend - wstart;
__memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T),
NRAM2NRAM, w_mask * dx_c_offset_seg * sizeof(T),
w_feature * dy_c_offset_seg * sizeof(T), hend - hstart - 1);
}
}
dy_start += dy_n_offset_seg;
dx_start += dx_n_offset_seg;
}
storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full);
}
template <typename T>
__mlu_func__ void psamaskBase(const T *input_dram, T *output_dram,
const Shape &input_full, const Shape &output_full,
LimitParam &limit, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition,
const bool is_forward, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const int n_per_core,
const int h_per_core, const int n_per_cluster,
const int h_per_cluster) {
PositionInCore position_full;
PositionInCore position_seg;
position_full.w_start = 0;
position_full.w_end = output_full.w;
int n_num_in_cluster = n_per_cluster;
int h_num_in_cluster = h_per_cluster;
switch (cluster_partition) {
case PARTITION_N: {
position_full.h_start = 0;
position_full.h_end = input_full.h;
position_full.n_start = taskIdY * n_per_cluster;
int cluster_need = (input_full.n + n_per_cluster - 1) / n_per_cluster;
if (taskIdY >= cluster_need) return;
int n_remainder = input_full.n - (cluster_need - 1) * n_per_cluster;
n_num_in_cluster =
(taskIdY == cluster_need - 1) ? n_remainder : n_per_cluster;
position_full.n_end = position_full.n_start + n_num_in_cluster;
}; break;
case PARTITION_H: {
position_full.n_start = 0;
position_full.n_end = input_full.n;
position_full.h_start = taskIdY * h_per_cluster;
int cluster_need = (input_full.h + h_per_cluster - 1) / h_per_cluster;
if (taskIdY >= cluster_need) return;
int h_remainder = input_full.h - (cluster_need - 1) * h_per_cluster;
h_num_in_cluster =
(taskIdY == cluster_need - 1) ? h_remainder : h_per_cluster;
position_full.h_end = position_full.h_start + h_num_in_cluster;
}; break;
}
switch (core_partition) {
case PARTITION_N: {
position_full.n_start += taskIdX * n_per_core;
int core_need = (n_num_in_cluster + n_per_core - 1) / n_per_core;
if (taskIdX >= core_need) return;
int n_remainder = n_num_in_cluster - (core_need - 1) * n_per_core;
position_full.n_end =
position_full.n_start +
((taskIdX == core_need - 1) ? n_remainder : n_per_core);
}; break;
case PARTITION_H: {
position_full.h_start += taskIdX * h_per_core;
int core_need = (h_num_in_cluster + h_per_core - 1) / h_per_core;
if (taskIdX >= core_need) return;
int h_remainder = h_num_in_cluster - (core_need - 1) * h_per_core;
position_full.h_end =
position_full.h_start +
((taskIdX == core_need - 1) ? h_remainder : h_per_core);
}; break;
}
// the counts of n, h and w to be processed in the current core
int shape_core_n = position_full.n_end - position_full.n_start;
int shape_core_h = position_full.h_end - position_full.h_start;
int shape_core_w = input_full.w;
limit.n = limit.n < shape_core_n ? limit.n : shape_core_n;
limit.h = limit.h < shape_core_h ? limit.h : shape_core_h;
limit.w = limit.w < shape_core_w ? limit.w : shape_core_w;
// load the data to nram according to the limit
for (int nidx = position_full.n_start; nidx < position_full.n_end;
nidx += limit.n) {
position_seg.n_start = nidx;
position_seg.n_end =
position_seg.n_start + (position_full.n_end - nidx < limit.n
? position_full.n_end - nidx
: limit.n);
for (int hidx = position_full.h_start; hidx < position_full.h_end;
hidx += limit.h) {
position_seg.h_start = hidx;
position_seg.h_end =
position_seg.h_start + (position_full.h_end - hidx < limit.h
? position_full.h_end - hidx
: limit.h);
for (int widx = position_full.w_start; widx < position_full.w_end;
widx += limit.w) {
position_seg.w_start = widx;
position_seg.w_end =
position_seg.w_start + (position_full.w_end - widx < limit.w
? position_full.w_end - widx
: limit.w);
// record the segment of output except the size of channel
// channel segments of output and input are the same
Shape shape_seg;
shape_seg.n = position_seg.n_end - position_seg.n_start;
shape_seg.h = position_seg.h_end - position_seg.h_start;
shape_seg.w = position_seg.w_end - position_seg.w_start;
shape_seg.c = output_full.c;
switch (psa_type) {
case COLLECT: {
if (is_forward) {
psamaskCollectForward(input_dram, output_dram, position_seg,
input_full, output_full, shape_seg, h_mask,
w_mask, half_h_mask, half_w_mask);
} else {
psamaskCollectBackward(input_dram, output_dram, position_seg,
input_full, output_full, shape_seg, h_mask,
w_mask, half_h_mask, half_w_mask);
}
} break;
case DISTRIBUTE: {
if (is_forward) {
psamaskDistributeForward(input_dram, output_dram, position_seg,
input_full, output_full, shape_seg,
h_mask, w_mask, half_h_mask,
half_w_mask);
} else {
psamaskDistributeBackward(input_dram, output_dram, position_seg,
input_full, output_full, shape_seg,
h_mask, w_mask, half_h_mask,
half_w_mask);
}
} break;
}
}
}
}
}
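
The PARTITION_N branch above splits the batch across clusters by taskIdY, with the last cluster absorbing the remainder; PARTITION_H does the same along the height axis. A scalar rendering of that split (sizes are illustrative, not from the kernel):

#include <cstdio>

int main() {
  const int batch = 10, n_per_cluster = 4;
  const int cluster_need = (batch + n_per_cluster - 1) / n_per_cluster;
  for (int taskIdY = 0; taskIdY < cluster_need; ++taskIdY) {
    const int n_start = taskIdY * n_per_cluster;
    const int n_remainder = batch - (cluster_need - 1) * n_per_cluster;
    const int n_num =
        (taskIdY == cluster_need - 1) ? n_remainder : n_per_cluster;
    std::printf("cluster %d handles n in [%d, %d)\n", taskIdY, n_start,
                n_start + n_num);
  }
  return 0;
}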
template <typename T>
__mlu_global__ void MLUUnion1KernelPsamaskForward(
const T *x, T *y, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int x_c, const int y_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg) {
if (coreId == 0x80) {
return;
}
Shape x_full, y_full;
x_full.n = batch;
x_full.h = h_feature;
x_full.w = w_feature;
x_full.c = x_c;
y_full.n = batch;
y_full.h = h_feature;
y_full.w = w_feature;
y_full.c = y_c;
LimitParam limit;
limit.n = limit_n_seg;
limit.h = limit_h_seg;
limit.w = limit_w_seg;
psamaskBase(x, y, x_full, y_full, limit, psa_type, core_partition,
cluster_partition, true, h_mask, w_mask, half_h_mask, half_w_mask,
n_per_core, h_per_core, n_per_cluster, h_per_cluster);
}
template <typename T>
__mlu_global__ void MLUUnion1KernelPsamaskBackward(
const T *dy, T *dx, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int dx_c, const int dy_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg) {
if (coreId == 0x80) {
return;
}
Shape dy_full, dx_full;
dx_full.n = batch;
dx_full.h = h_feature;
dx_full.w = w_feature;
dx_full.c = dx_c;
dy_full.n = batch;
dy_full.h = h_feature;
dy_full.w = w_feature;
dy_full.c = dy_c;
LimitParam limit;
limit.n = limit_n_seg;
limit.h = limit_h_seg;
limit.w = limit_w_seg;
psamaskBase(dy, dx, dy_full, dx_full, limit, psa_type, core_partition,
cluster_partition, false, h_mask, w_mask, half_h_mask,
half_w_mask, n_per_core, h_per_core, n_per_cluster,
h_per_cluster);
}
void KernelPsamaskForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *x, void *y, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int x_c, const int y_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg) {
MLUUnion1KernelPsamaskForward<<<k_dim, k_type, queue>>>(
static_cast<const float *>(x), static_cast<float *>(y), psa_type,
core_partition, cluster_partition, batch, h_feature, w_feature, h_mask,
w_mask, x_c, y_c, half_h_mask, half_w_mask, n_per_core, h_per_core,
n_per_cluster, h_per_cluster, limit_n_seg, limit_h_seg, limit_w_seg);
}
void KernelPsamaskBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *dy, void *dx, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int dx_c, const int dy_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg) {
MLUUnion1KernelPsamaskBackward<<<k_dim, k_type, queue>>>(
static_cast<const float *>(dy), static_cast<float *>(dx), psa_type,
core_partition, cluster_partition, batch, h_feature, w_feature, h_mask,
w_mask, dx_c, dy_c, half_h_mask, half_w_mask, n_per_core, h_per_core,
n_per_cluster, h_per_cluster, limit_n_seg, limit_h_seg, limit_w_seg);
}
mmcv/ops/csrc/common/mlu/psamask_utils.hpp
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef PSAMASK_UTILS_HPP_
#define PSAMASK_UTILS_HPP_
typedef enum {
  COLLECT = 0,
  DISTRIBUTE = 1,
} PsamaskType;

typedef enum {
  PARTITION_N = 0,
  PARTITION_H = 1,
} DimPartitionType;

struct PartitionSeg {
  int h_per_cluster;
  int n_per_cluster;
  int h_per_core;
  int n_per_core;
  DimPartitionType cluster_partition;
  DimPartitionType core_partition;
};

struct Shape {
  int n;
  int h;
  int w;
  int c;
};

struct LimitParam {
  int n;
  int h;
  int w;
};

struct PositionInCore {
  int n_start;
  int n_end;
  int h_start;
  int h_end;
  int w_start;
  int w_end;
};
#endif // PSAMASK_UTILS_HPP_
mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#define ROI_OFFSET 5
__nram__ char buffer[MAX_NRAM_SIZE];
namespace forward {
template <typename T>
__mlu_func__ void bilinearInterpolate(const int input_height,
const int input_width, T y, T x, T *w1,
T *w2, T *w3, T *w4, int *x_low,
int *x_high, int *y_low, int *y_high,
bool *empty) {
// deal with cases where the sampling point is outside the feature map boundary
if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) {
*empty = true;
return;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low_ = int(y);
int x_low_ = int(x);
if (y_low_ >= input_height - 1) {
*y_high = y_low_ = input_height - 1;
y = (T)y_low_;
} else {
*y_high = y_low_ + 1;
}
if (x_low_ >= input_width - 1) {
*x_high = x_low_ = input_width - 1;
x = T(x_low_);
} else {
*x_high = x_low_ + 1;
}
*y_low = y_low_;
*x_low = x_low_;
T ly = y - y_low_;
T lx = x - x_low_;
T hy = 1.0 - ly;
T hx = 1.0 - lx;
*w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
return;
}
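
A host-side spot check of the weights this function produces: for an interior sample point the four weights are products of the fractional distances to the neighbouring pixels and always sum to one. A standalone sketch covering the interior case only:

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  const float y = 2.3f, x = 4.6f;  // interior sample point
  const int y_low = (int)y, x_low = (int)x;
  const float ly = y - y_low, lx = x - x_low;
  const float hy = 1.0f - ly, hx = 1.0f - lx;
  const float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
  std::printf("w = {%g, %g, %g, %g}\n", w1, w2, w3, w4);
  assert(std::fabs(w1 + w2 + w3 + w4 - 1.0f) < 1e-6f);  // weights sum to 1
  return 0;
}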
template <typename T>
__mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
T *nram_out, const int roi_bin_grid_h,
const int roi_bin_grid_w, const T roi_start_h,
const T roi_start_w, const int ph,
const int pw, const T bin_size_h,
const T bin_size_w, const float count,
const int input_height, const int input_width,
const int channels, const int cyc_num,
const int max_elements) {
int cyc_channel = max_elements;
for (int i = 0; i < cyc_num; i++) {
int real_channel =
(i == cyc_num - 1) ? channels - i * cyc_channel : cyc_channel;
int align_channel = PAD_UP(real_channel, NFU_ALIGN_SIZE / sizeof(T));
__bang_write_zero(nram_out, align_channel);
uint32_t real_size = real_channel * sizeof(T);
int iy, ix;
for (iy = 0; iy < roi_bin_grid_h; iy++) {
// 1. compute the coordinates of the y axis in the current roi_bin_grid_h
T y = roi_start_h + ph * bin_size_h +
(T)(iy + 0.5) * bin_size_h / (T)(roi_bin_grid_h);
for (ix = 0; ix < roi_bin_grid_w; ix++) {
// 2. compute the coordinates of the x axis in the current
// roi_bin_grid_w
T x = roi_start_w + pw * bin_size_w +
(T)(ix + 0.5) * bin_size_w / (T)(roi_bin_grid_w);
// 3. compute the four weights (w1, w2, w3 and w4), the height (y_low
// and y_high) and width (x_low and x_high) indices of the input
// feature map in the current roi bin grid, and the flag (empty) which
// shows whether x and y are out of the input feature map's range
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bool empty = false;
bilinearInterpolate(input_height, input_width, y, x, &w1, &w2, &w3, &w4,
&x_low, &x_high, &y_low, &y_high, &empty);
// 4. compute interpolation of the current roi bin grid
// tmp_cyc1, temp_cyc2, tmp_cyc3 and tmp_cyc4 store the input values
// to compute the interpolation, and then reused to compute
// the argmax_x and argmax_y.
T *tmp_cyc1 = nram_in + cyc_channel;
T *tmp_cyc2 = nram_in + cyc_channel * 2;
T *tmp_cyc3 = nram_in + cyc_channel * 3;
T *tmp_cyc4 = nram_in + cyc_channel * 4;
if (empty) {  // the sampling point is out of the feature map
__bang_write_zero(nram_in, align_channel);
} else {
__bang_write_zero(nram_in, align_channel);
uint32_t offset1 = (y_low * input_width + x_low) * channels;
uint32_t offset2 = (y_low * input_width + x_high) * channels;
uint32_t offset3 = (y_high * input_width + x_low) * channels;
uint32_t offset4 = (y_high * input_width + x_high) * channels;
T *input1 = (T *)input_core + offset1 + i * cyc_channel;
T *input2 = (T *)input_core + offset2 + i * cyc_channel;
T *input3 = (T *)input_core + offset3 + i * cyc_channel;
T *input4 = (T *)input_core + offset4 + i * cyc_channel;
// load the four pixels (p1, p2, p3 and p4) of input feature map to
// compute interpolation
__memcpy(tmp_cyc1, input1, real_size, GDRAM2NRAM);
__memcpy(tmp_cyc2, input2, real_size, GDRAM2NRAM);
__memcpy(tmp_cyc3, input3, real_size, GDRAM2NRAM);
__memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM);
// interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
__bang_mul_scalar(tmp_cyc1, tmp_cyc1, w1, align_channel);
__bang_mul_scalar(tmp_cyc2, tmp_cyc2, w2, align_channel);
__bang_mul_scalar(tmp_cyc3, tmp_cyc3, w3, align_channel);
__bang_mul_scalar(tmp_cyc4, tmp_cyc4, w4, align_channel);
__bang_add(nram_in, tmp_cyc1, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc2, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc3, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc4, nram_in, align_channel);
}
// 5. compute sum value and corresponding coordinates of x axis and y
// axis. Update the sum value.
__bang_add(nram_out, nram_in, nram_out, align_channel);
} // loop_roi_grid_w
} // loop_roi_grid_h
T count_value = (T)(1.0 / count);
__bang_mul_scalar(nram_out, nram_out, count_value, align_channel);
__memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM);
} // loop_cyc_num
}
template <typename T>
__mlu_func__ void roialignForwardAvg(
T *input, T *rois, T *output, const bool aligned, const int channels,
const int pooled_height, const int pooled_width, const int input_height,
const int input_width, const int sampling_ratio, const T spatial_scale,
const int num_rois) {
// find limit for channel: the nram space is divided into 6 parts, which are
// the input, the 4 interpolation weights (w1, w2, w3, w4), and the output
// max_elements : 300 : float datatype : 27296, half datatype : 54592
// max_elements : 200 : float datatype : 16384, half datatype : 32768
int max_elements = (PAD_DOWN(MAX_NRAM_SIZE / 6, NFU_ALIGN_SIZE)) / sizeof(T);
int cyc_num = channels / max_elements + (int)(channels % max_elements != 0);
T offset = aligned ? (T)0.5 : (T)0.0;
int task_num = num_rois * pooled_height * pooled_width;
T *nram_out = (T *)buffer;
T *nram_in = nram_out + max_elements;
if (task_num < taskDim) {
if (taskId >= task_num) {
return;
}
}
for (int bin_idx = taskId; bin_idx < task_num; bin_idx = bin_idx + taskDim) {
if (bin_idx >= task_num) {
return;
}
// (n, ph, pw) indexes one channel vector in the pooled output
int pw = bin_idx % pooled_width;
int ph = (bin_idx / pooled_width) % pooled_height;
int n = bin_idx / pooled_width / pooled_height;
T *roi_id_tmp = rois + n * ROI_OFFSET;
// 1. compute width and height of roi region.
int batch_idx = (int)roi_id_tmp[0];
T roi_x1 = roi_id_tmp[1];
T roi_y1 = roi_id_tmp[2];
T roi_x2 = roi_id_tmp[3];
T roi_y2 = roi_id_tmp[4];
T roi_start_w = roi_x1 * spatial_scale - offset;
T roi_start_h = roi_y1 * spatial_scale - offset;
T roi_end_w = roi_x2 * spatial_scale - offset;
T roi_end_h = roi_y2 * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (!aligned) {
roi_width = roi_width > (T)(1.0) ? roi_width : (T)(1.0);
roi_height = roi_height > (T)(1.0) ? roi_height : (T)(1.0);
}
// 2. compute float-type width and height of roi bin region.
T bin_size_w = (T)roi_width / (T)pooled_width;
T bin_size_h = (T)roi_height / (T)pooled_height;
// 3. compute int-type width and height of roi bin region.
int roi_bin_grid_h, roi_bin_grid_w;
roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: int(ceilf(roi_height / pooled_height));
roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: int(ceilf(roi_width / pooled_width));
float count = (float)((roi_bin_grid_h * roi_bin_grid_w) > 1
? roi_bin_grid_h * roi_bin_grid_w
: 1.0);
T *input_core = input + batch_idx * channels * input_width * input_height;
T *output_core = output + bin_idx * channels;
// 4. compute avg value and corresponding coordinates of x axis and y axis.
computeChannel(input_core, nram_in, output_core, nram_out, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, ph, pw, bin_size_h,
bin_size_w, count, input_height, input_width, channels,
cyc_num, max_elements);
}
}
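
When sampling_ratio <= 0, the grid is derived adaptively from the bin size, as in step 3 above, and each output bin averages roi_bin_grid_h * roi_bin_grid_w interpolated samples. A standalone computation with illustrative numbers:

#include <cmath>
#include <cstdio>

int main() {
  const float roi_height = 13.0f, roi_width = 7.0f;
  const int pooled_height = 7, pooled_width = 7, sampling_ratio = 0;
  const int grid_h = sampling_ratio > 0
                         ? sampling_ratio
                         : (int)std::ceil(roi_height / pooled_height);
  const int grid_w = sampling_ratio > 0
                         ? sampling_ratio
                         : (int)std::ceil(roi_width / pooled_width);
  const int count = grid_h * grid_w > 1 ? grid_h * grid_w : 1;
  std::printf("grid %dx%d, %d samples averaged per bin\n", grid_h, grid_w,
              count);
  return 0;
}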
__mlu_global__ void MLUUnion1KernelRoiAlignAvg(
const void *input, const void *rois, const int channels, const bool aligned,
const int pooled_height, const int pooled_width, const int input_height,
const int input_width, const int sampling_ratio, const float spatial_scale,
const int num_rois, const cnrtDataType_t data_type, void *output) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (data_type) {
case CNRT_FLOAT16: {
roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned,
channels, pooled_height, pooled_width, input_height,
input_width, sampling_ratio, (half)spatial_scale,
num_rois);
}; break;
case CNRT_FLOAT32: {
roialignForwardAvg((float *)input, (float *)rois, (float *)output,
aligned, channels, pooled_height, pooled_width,
input_height, input_width, sampling_ratio,
(float)spatial_scale, num_rois);
}; break;
default:
break;
}
return;
}
} // namespace forward
namespace backward {
__mlu_func__ void bilinearInterpolateGradient(int height, int width, float y,
float x, float *w1, float *w2,
float *w3, float *w4, int *x_low,
int *x_high, int *y_low,
int *y_high) {
if (y < -1.0 || y > height || x < -1.0 || x > width) {
*w1 = 0.0, *w2 = 0.0, *w3 = 0.0, *w4 = 0.0;
*x_low = -1, *x_high = -1, *y_low = -1, *y_high = -1;
return;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
*y_low = (int)y;
*x_low = (int)x;
if (*y_low >= height - 1) {
*y_high = height - 1, *y_low = height - 1;
y = (float)(*y_low);
} else {
*y_high = *y_low + 1;
}
if (*x_low >= width - 1) {
*x_high = width - 1, *x_low = width - 1;
x = (float)(*x_low);
} else {
*x_high = *x_low + 1;
}
float ly = y - *y_low, lx = x - *x_low;
float hy = 1.0 - ly, hx = 1.0 - lx;
*w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
return;
}
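
Each pooled-bin gradient is scattered to its four bilinear neighbours as w_k * grad / count, which is what the __bang_mul_scalar / __bang_atomic_add sequence in the function below implements per channel block. A scalar rendering with illustrative weights (the weights sum to 1, so each sample contributes grad / count in total):

#include <cstdio>

int main() {
  const float grad = 1.0f, count = 4.0f;            // samples per bin
  const float w[4] = {0.28f, 0.42f, 0.12f, 0.18f};  // bilinear weights
  float scattered = 0.0f;
  for (int k = 0; k < 4; ++k) scattered += w[k] * grad / count;
  std::printf("total scattered per sample: %g\n", scattered);  // grad / count
  return 0;
}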
template <typename T>
__mlu_func__ void unionRoiAlignBp(
T *grads, T *boxes, T *grads_image, const int boxes_num, const int hi,
const int wi, const int c, const int no, const int ho, const int wo,
const float spatial_scale, const int sampling_ratio, const bool aligned) {
int c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T));
int deal_all = boxes_num * hi * wi;
int deal_this_core = deal_all / taskDim + (int)(taskId < deal_all % taskDim);
for (int i = 0; i < deal_this_core; ++i) {
int bhw_id = i * taskDim + taskId;
int box_id = bhw_id / (hi * wi);
int ih = (bhw_id / wi) % hi;
int iw = bhw_id % wi;
T *box = boxes + box_id * 5;
int image_id = (int)box[0];
T *image_offset = grads_image + image_id * ho * wo * c;
T *grads_ = grads + box_id * hi * wi * c + ih * wi * c + iw * c;
float offset = aligned ? 0.5 : 0.0;
float x1 = box[1] * spatial_scale - offset;
float y1 = box[2] * spatial_scale - offset;
float x2 = box[3] * spatial_scale - offset;
float y2 = box[4] * spatial_scale - offset;
float roi_width = x2 - x1;
float roi_height = y2 - y1;
if (!aligned) {
roi_width = (roi_width > 1.0) ? roi_width : 1.0;
roi_height = (roi_height > 1.0) ? roi_height : 1.0;
}
float bin_size_h = roi_height / hi;
float bin_size_w = roi_width / wi;
int roi_grid_h =
(sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_height / hi);
int roi_grid_w =
(sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_width / wi);
const T count = roi_grid_h * roi_grid_w;
if (c_align * sizeof(T) * 2 <= MAX_NRAM_SIZE) {
for (int iy = 0; iy < roi_grid_h; ++iy) {
const float y =
y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h;
for (int ix = 0; ix < roi_grid_w; ++ix) {
const float x =
x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w;
float w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low,
&x_high, &y_low, &y_high);
if (x_low >= 0 && y_low >= 0) {
__memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w1,
c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_low * c,
(T *)buffer + c_align, c);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w2,
c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_high * c,
(T *)buffer + c_align, c);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w3,
c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_low * c,
(T *)buffer + c_align, c);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w4,
c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_high * c,
(T *)buffer + c_align, c);
} // x_low && y_low
} // ix
} // iy
} else {
for (int iy = 0; iy < roi_grid_h; ++iy) {
const float y =
y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h;
for (int ix = 0; ix < roi_grid_w; ++ix) {
const float x =
x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w;
float w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low,
&x_high, &y_low, &y_high);
if (x_low >= 0 && y_low >= 0) {
int deal_once =
PAD_DOWN(MAX_NRAM_SIZE / 2, NFU_ALIGN_SIZE) / sizeof(T);
int c_repeat = c / deal_once + (int)(c % deal_once != 0);
for (int i = 0; i < c_repeat; ++i) {
int deal_c = deal_once;
int align_c = deal_once;
if (i == c_repeat - 1) {
deal_c = c - i * deal_once;
align_c = c_align - i * deal_once;
}
__memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T),
GDRAM2NRAM);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w1,
align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_low * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w2,
align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_low * wo * c + x_high * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w3,
align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_high * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w4,
align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_high * wo * c + x_high * c + i * deal_once,
(T *)buffer + align_c, deal_c);
} // for c_repeat
} // x_low >= 0 && y_low >= 0
} // ix
} // iy
} // if c
} // i
}
__mlu_global__ void MLUUnion1KernelRoiAlignBackward(
const void *grads, const void *boxes, void *grads_image,
const cnrtDataType_t dtype, const int boxes_num, const int hi, const int wi,
const int c, const int no, const int ho, const int wo,
const float spatial_scale, const int sampling_ratio, const bool aligned) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (dtype) {
case CNRT_FLOAT16: {
unionRoiAlignBp((half *)grads, (half *)boxes, (half *)grads_image,
boxes_num, hi, wi, c, no, ho, wo, spatial_scale,
sampling_ratio, aligned);
}; break;
case CNRT_FLOAT32: {
unionRoiAlignBp((float *)grads, (float *)boxes, (float *)grads_image,
boxes_num, hi, wi, c, no, ho, wo, spatial_scale,
sampling_ratio, aligned);
}; break;
default: { return; }
}
}
} // namespace backward
void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *input, const void *rois, const int channels,
const bool aligned, const int pooled_height,
const int pooled_width, const int input_height,
const int input_width, const int sampling_ratio,
const float spatial_scale, const int num_rois,
void *output) {
forward::MLUUnion1KernelRoiAlignAvg<<<k_dim, k_type, queue>>>(
input, rois, channels, aligned, pooled_height, pooled_width, input_height,
input_width, sampling_ratio, spatial_scale, num_rois, d_type, output);
}
void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t dtype,
const void *grads, const void *boxes,
void *grads_image, const int boxes_num,
const int hi, const int wi, const int c,
const int no, const int ho, const int wo,
const float spatial_scale, const int sampling_ratio,
const bool aligned) {
backward::MLUUnion1KernelRoiAlignBackward<<<k_dim, k_type, queue>>>(
grads, boxes, grads_image, dtype, boxes_num, hi, wi, c, no, ho, wo,
spatial_scale, sampling_ratio, aligned);
}
mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include "roi_align_rotated_utils.hpp"
#define ROI_OFFSET 6
#define SAMPLING_NUM 4
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void swap(T &a, T &b) {
T tmp = a;
a = b;
b = tmp;
}
template <typename T>
__mlu_func__ void bilinearInterpolate(const int input_height,
const int input_width, T x, T y, T *w1,
T *w2, T *w3, T *w4, int *x_low,
int *x_high, int *y_low, int *y_high,
bool *empty) {
// deal with the case where the point is outside the feature map boundary
if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) {
*empty = true;
return;
}
if (y <= 0) y = (T)0;
if (x <= 0) x = (T)0;
*y_low = int(y);
*x_low = int(x);
if (*y_low >= input_height - 1) {
*y_high = *y_low = input_height - 1;
y = (T)(*y_low);
} else {
*y_high = *y_low + 1;
}
if (*x_low >= input_width - 1) {
*x_high = *x_low = input_width - 1;
x = T(*x_low);
} else {
*x_high = *x_low + 1;
}
T ly = y - *y_low;
T lx = x - *x_low;
T hy = 1.0 - ly;
T hx = 1.0 - lx;
*w1 = hy * hx;
*w2 = hy * lx;
*w3 = ly * hx;
*w4 = ly * lx;
return;
}
template <typename T>
__mlu_func__ void getRoiBinInfo(const T *rois_dram, const int bin_i,
const RoiAlignRotatedParams ¶ms,
int *batch_idx, int *roi_n, int *pw, int *ph,
T *roi_center_x, T *roi_center_y, T *roi_width,
T *roi_height, T *theta) {
T offset = params.aligned ? (T)0.5 : (T)0.0;
*pw = bin_i % params.pooled_width;
*ph = (bin_i / params.pooled_width) % params.pooled_height;
*roi_n = bin_i / params.pooled_width / params.pooled_height;
const T *roi_info = rois_dram + (*roi_n) * ROI_OFFSET;
*batch_idx = (int)roi_info[0];
*roi_center_x = roi_info[1] * (T)params.spatial_scale - offset;
*roi_center_y = roi_info[2] * (T)params.spatial_scale - offset;
*roi_width = roi_info[3] * (T)params.spatial_scale;
*roi_height = roi_info[4] * (T)params.spatial_scale;
*theta = roi_info[5];
if (params.clockwise) {
*theta = -(*theta);
}
if (!params.aligned) {
*roi_width = *roi_width > (T)1.0 ? *roi_width : (T)1.0;
*roi_height = *roi_height > (T)1.0 ? *roi_height : (T)1.0;
}
}
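
Each bin-local sample (xx, yy) is then rotated by theta and shifted to the roi center, which is exactly the transform applied inside the forward loop below. A scalar sketch with illustrative values (standalone, not kernel code):

#include <cmath>
#include <cstdio>

int main() {
  const float theta = 0.5f;          // rotation angle in radians
  const float cx = 8.0f, cy = 6.0f;  // roi center
  const float xx = 1.0f, yy = 2.0f;  // bin-local sample point
  const float y = yy * std::cos(theta) - xx * std::sin(theta) + cy;
  const float x = yy * std::sin(theta) + xx * std::cos(theta) + cx;
  std::printf("rotated sample: x=%g, y=%g\n", x, y);
  return 0;
}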
template <typename T>
__mlu_func__ void roiAlignRotatedForward(const T *input_dram,
const T *rois_dram, const int batch,
const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams ¶ms,
T *output_dram) {
int align_base_128 = NFU_ALIGN_SIZE / sizeof(T);
int channel_max_cap = MAX_NRAM_SIZE / sizeof(T) / (2 * SAMPLING_NUM + 1);
channel_max_cap = channel_max_cap / align_base_128 * align_base_128;
int channel_align = channel < channel_max_cap ? channel : channel_max_cap;
channel_align = CEIL_ALIGN(channel_align, align_base_128);
T *nram_out = (T *)nram_buffer;
T *nram_ping = nram_out + channel_align;
T *nram_pong = nram_ping + channel_align * SAMPLING_NUM;
int bin_first = taskId;
int bin_end = rois_num * params.pooled_height * params.pooled_width;
for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) {
T roi_center_x, roi_center_y, roi_width, roi_height, theta;
int batch_idx, roi_n, pw, ph;
getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph,
&roi_center_x, &roi_center_y, &roi_width, &roi_height,
&theta);
T bin_size_h = roi_height / params.pooled_height;
T bin_size_w = roi_width / params.pooled_width;
int roi_bin_grid_h =
(params.sample_ratio > 0)
? params.sample_ratio
: __float2int_up((float)roi_height / params.pooled_height);
int roi_bin_grid_w =
(params.sample_ratio > 0)
? params.sample_ratio
: __float2int_up((float)roi_width / params.pooled_width);
T roi_start_y = -roi_height / 2;
T roi_start_x = -roi_width / 2;
const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1
? roi_bin_grid_h * roi_bin_grid_w
: 1;
T cos_theta = std::cos(theta);
T sin_theta = std::sin(theta);
T zero_sign = 1.0f / bin_dim;
bool is_first_sample = true;
int src_offset = 0;
int dst_offset = 0;
int c_rem, c_slice, c_slice_align, pongc_slice, pongc_slice_align;
for (int c_offset = 0; c_offset < channel; c_offset += channel_align) {
__bang_write_value(nram_out, channel_align, (T)0);
c_rem = channel - c_offset;
c_slice = channel_align > c_rem ? c_rem : channel_align;
c_slice_align = CEIL_ALIGN(c_slice, align_base_128);
is_first_sample = true;
for (int iy = 0; iy < roi_bin_grid_h; ++iy) {
const T yy = roi_start_y + ph * bin_size_h +
T(iy + 0.5) * bin_size_h / roi_bin_grid_h;
for (int ix = 0; ix < roi_bin_grid_w; ++ix) {
const T xx = roi_start_x + pw * bin_size_w +
T(ix + 0.5) * bin_size_w / roi_bin_grid_w;
int sample_i = iy * roi_bin_grid_w + ix;
T y = yy * cos_theta - xx * sin_theta + roi_center_y;
T x = yy * sin_theta + xx * cos_theta + roi_center_x;
T w1, w2, w3, w4;
bool empty = false;
int x_low, x_high, y_low, y_high;
bilinearInterpolate(height, width, x, y, &w1, &w2, &w3, &w4, &x_low,
&x_high, &y_low, &y_high, &empty);
/*******************************************************
| ping | pong |
|------|-----|-----|-----|-----|-----|-----|-----|-----|
|output| p1 | p2 | p3 | p4 | p1 | p2 | p3 | p4 |
|------|-----|-----|-----|-----|-----|-----|-----|-----|
********************************************************/
if (is_first_sample && !empty) {
// load input data from dram to nram
__bang_write_value(nram_ping, SAMPLING_NUM * c_slice_align, (T)0);
src_offset =
(batch_idx * height * width + y_low * width + x_low) * channel +
c_offset;
dst_offset = 0;
__memcpy(nram_ping + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
src_offset = (batch_idx * height * width + y_low * width + x_high) *
channel +
c_offset;
dst_offset = c_slice_align;
__memcpy(nram_ping + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
src_offset = (batch_idx * height * width + y_high * width + x_low) *
channel +
c_offset;
dst_offset = c_slice_align * 2;
__memcpy(nram_ping + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
src_offset =
(batch_idx * height * width + y_high * width + x_high) *
channel +
c_offset;
dst_offset = c_slice_align * 3;
__memcpy(nram_ping + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
}
// load next input data to nram
if (sample_i + 1 < bin_dim) {
int p_iy = (sample_i + 1) / roi_bin_grid_w;
int p_ix = (sample_i + 1) % roi_bin_grid_w;
const T p_yy = roi_start_y + ph * bin_size_h +
T(p_iy + 0.5) * bin_size_h / roi_bin_grid_h;
const T p_xx = roi_start_x + pw * bin_size_w +
T(p_ix + 0.5) * bin_size_w / roi_bin_grid_w;
T p_y = p_yy * cos_theta - p_xx * sin_theta + roi_center_y;
T p_x = p_yy * sin_theta + p_xx * cos_theta + roi_center_x;
T p_w1, p_w2, p_w3, p_w4;
bool p_empty = false;
int p_x_low, p_x_high, p_y_low, p_y_high;
bilinearInterpolate(height, width, p_x, p_y, &p_w1, &p_w2, &p_w3,
&p_w4, &p_x_low, &p_x_high, &p_y_low, &p_y_high,
&p_empty);
pongc_slice = c_slice;
pongc_slice_align = c_slice_align;
if (!p_empty) {
__bang_write_value(nram_pong, SAMPLING_NUM * pongc_slice_align,
(T)0);
src_offset =
(batch_idx * height * width + p_y_low * width + p_x_low) *
channel +
c_offset;
dst_offset = 0;
__memcpy(nram_pong + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
src_offset =
(batch_idx * height * width + p_y_low * width + p_x_high) *
channel +
c_offset;
dst_offset = pongc_slice_align;
__memcpy(nram_pong + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
src_offset =
(batch_idx * height * width + p_y_high * width + p_x_low) *
channel +
c_offset;
dst_offset = pongc_slice_align * 2;
__memcpy(nram_pong + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
src_offset =
(batch_idx * height * width + p_y_high * width + p_x_high) *
channel +
c_offset;
dst_offset = pongc_slice_align * 3;
__memcpy(nram_pong + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
}
}
T *tmp_sum = nram_ping + 3 * c_slice_align;
if (empty) {
__bang_write_value(tmp_sum, c_slice_align, T(0));
} else {
__bang_mul_scalar(nram_ping, nram_ping, w1, c_slice_align);
__bang_mul_scalar(nram_ping + c_slice_align,
nram_ping + c_slice_align, w2, c_slice_align);
__bang_mul_scalar(nram_ping + 2 * c_slice_align,
nram_ping + 2 * c_slice_align, w3, c_slice_align);
__bang_mul_scalar(nram_ping + 3 * c_slice_align,
nram_ping + 3 * c_slice_align, w4, c_slice_align);
__bang_sumpool(tmp_sum, nram_ping, c_slice_align, 1, SAMPLING_NUM,
1, SAMPLING_NUM, 1, 1);
}
__bang_add(nram_out, nram_out, tmp_sum, c_slice_align);
swap(nram_ping, nram_pong);
__asm__ volatile("sync;");
is_first_sample = false;
}
}
__bang_mul_scalar(nram_out, nram_out, zero_sign, c_slice_align);
// store the result to dram
int output_offset =
((roi_n * params.pooled_height + ph) * params.pooled_width + pw) *
channel +
c_offset;
__memcpy(output_dram + output_offset, nram_out, c_slice * sizeof(T),
NRAM2GDRAM);
}
}
}
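// --- Editor's sketch of the double-buffering pattern used above, reduced to
// pseudo-steps (generic names, not the kernel's actual calls):
//   load(ping, sample 0);
//   for (s = 0; s < bin_dim; ++s) {
//     if (s + 1 < bin_dim) async_load(pong, sample s + 1);  // prefetch next
//     weighted_sum += w[s] * ping;                          // compute current
//     swap(ping, pong);
//     sync();  // fence: the prefetch must finish before it becomes "ping"
//   }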
template <typename T>
__mlu_func__ void roiAlignRotatedBackward(const T *top_grad_dram,
const T *rois_dram, const int batch,
const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams ¶ms,
T *bottom_grad_dram) {
int align_base_128 = NFU_ALIGN_SIZE / sizeof(T);
int channel_align = CEIL_ALIGN(channel, align_base_128);
unsigned int max_element = MAX_NRAM_SIZE / sizeof(T);
int c_limit = max_element >> 2;
c_limit = c_limit > channel_align ? channel_align : c_limit;
T *nram_ping = (T *)nram_buffer;
T *nram_pong = nram_ping + 2 * c_limit;
T *nram_output = nullptr;
int bin_first = taskId;
int bin_end = rois_num * params.pooled_height * params.pooled_width;
bool is_first_bin = true;
T roi_center_x, roi_center_y, roi_width, roi_height, theta;
int batch_idx, roi_n, pw, ph;
T pong_roi_center_x, pong_roi_center_y, pong_roi_width, pong_roi_height,
pong_theta;
int pong_batch_idx, pong_roi_n, pong_pw, pong_ph;
for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) {
getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph,
&roi_center_x, &roi_center_y, &roi_width, &roi_height,
&theta);
T bin_size_h = roi_height / params.pooled_height;
T bin_size_w = roi_width / params.pooled_width;
int roi_bin_grid_h =
(params.sample_ratio > 0)
? params.sample_ratio
: __float2int_up((float)roi_height / params.pooled_height);
int roi_bin_grid_w =
(params.sample_ratio > 0)
? params.sample_ratio
: __float2int_up((float)roi_width / params.pooled_width);
T roi_start_y = -roi_height / 2;
T roi_start_x = -roi_width / 2;
const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1
? roi_bin_grid_h * roi_bin_grid_w
: 1;
T cos_theta = std::cos(theta);
T sin_theta = std::sin(theta);
T zero_sign = 1.0f / bin_dim;
int c_rem, c_slice, pongc_slice, c_offset;
c_rem = channel;
c_offset = 0;
/****************************************
| ping | pong |
|---------|---------|---------|---------|
| input | output | input | output |
|---------|---------|---------|---------|
*****************************************/
if (is_first_bin) {
// load the first top_grad to nram
c_slice = c_limit < c_rem ? c_limit : c_rem;
int top_grad_offset =
((roi_n * params.pooled_height + ph) * params.pooled_width + pw) *
channel;
__memcpy(nram_ping, top_grad_dram + top_grad_offset, c_slice * sizeof(T),
GDRAM2NRAM);
}
nram_output = nram_ping + c_limit;
while (c_rem > 0) {
c_slice = c_slice < c_rem ? c_slice : c_rem;
// load the next top_grad to nram
if (c_rem - c_slice > 0) {
// load the rest channels to nram
pongc_slice = (c_rem - c_slice > c_slice) ? c_slice : c_rem - c_slice;
int top_grad_offset =
((roi_n * params.pooled_height + ph) * params.pooled_width + pw) *
channel +
c_offset + c_slice;
__memcpy_async(nram_pong, top_grad_dram + top_grad_offset,
pongc_slice * sizeof(T), GDRAM2NRAM);
} else if (bin_i + taskDim < bin_end) {
// load next bin's data to nram
getRoiBinInfo(rois_dram, bin_i + taskDim, params, &pong_batch_idx,
&pong_roi_n, &pong_pw, &pong_ph, &pong_roi_center_x,
&pong_roi_center_y, &pong_roi_width, &pong_roi_height,
&pong_theta);
pongc_slice = c_limit < channel ? c_limit : channel;
int top_grad_offset = ((pong_roi_n * params.pooled_height + pong_ph) *
params.pooled_width +
pong_pw) *
channel;
__memcpy_async(nram_pong, top_grad_dram + top_grad_offset,
c_slice * sizeof(T), GDRAM2NRAM);
}
    // compute the output within a single bin
for (int iy = 0; iy < roi_bin_grid_h; ++iy) {
const T yy = roi_start_y + ph * bin_size_h +
T(iy + 0.5) * bin_size_h / roi_bin_grid_h;
for (int ix = 0; ix < roi_bin_grid_w; ++ix) {
const T xx = roi_start_x + pw * bin_size_w +
T(ix + 0.5) * bin_size_w / roi_bin_grid_w;
T y = yy * cos_theta - xx * sin_theta + roi_center_y;
T x = yy * sin_theta + xx * cos_theta + roi_center_x;
T w1, w2, w3, w4;
bool empty = false;
int x_low, x_high, y_low, y_high;
bilinearInterpolate(height, width, x, y, &w1, &w2, &w3, &w4, &x_low,
&x_high, &y_low, &y_high, &empty);
if (empty) {
continue;
} else {
__bang_mul_scalar(nram_output, nram_ping, w1 * zero_sign, c_limit);
__bang_atomic_add(
(T *)nram_output,
bottom_grad_dram + batch_idx * height * width * channel +
y_low * width * channel + x_low * channel + c_offset,
(T *)nram_output, c_slice);
__bang_mul_scalar(nram_output, nram_ping, w2 * zero_sign, c_limit);
__bang_atomic_add(
(T *)nram_output,
bottom_grad_dram + batch_idx * height * width * channel +
y_low * width * channel + x_high * channel + c_offset,
(T *)nram_output, c_slice);
__bang_mul_scalar(nram_output, nram_ping, w3 * zero_sign, c_limit);
__bang_atomic_add(
(T *)nram_output,
bottom_grad_dram + batch_idx * height * width * channel +
y_high * width * channel + x_low * channel + c_offset,
(T *)nram_output, c_slice);
__bang_mul_scalar(nram_output, nram_ping, w4 * zero_sign, c_limit);
__bang_atomic_add(
(T *)nram_output,
bottom_grad_dram + batch_idx * height * width * channel +
y_high * width * channel + x_high * channel + c_offset,
(T *)nram_output, c_slice);
}
}
}
swap(nram_ping, nram_pong);
c_rem -= c_slice;
c_offset += c_slice;
__asm__ volatile("sync;");
}
is_first_bin = false;
}
}
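// Worked example of the gradient scatter above (illustrative numbers): for a
// sample with lx = 0.2 and ly = 0.3, the (y_low, x_low) corner has
// w1 = hy * hx = 0.7 * 0.8 = 0.56; with bin_dim = 4 and an incoming gradient
// of 1.0, that corner is atomically incremented by w1 * zero_sign = 0.14.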
__mlu_global__ void MLUUnion1KernelRoiAlignRotatedForward(
const void *features, const void *rois, void *output, const int batch,
const int height, const int width, const int channel, const int rois_num,
const RoiAlignRotatedParams rroiAlignParams,
const cnrtDataType_t data_type) {
if (0x80 == coreId) {
return;
}
if (data_type == CNRT_FLOAT32) {
roiAlignRotatedForward((float *)features, (float *)rois, batch, height,
width, channel, rois_num, rroiAlignParams,
(float *)output);
} else {
roiAlignRotatedForward((half *)features, (half *)rois, batch, height, width,
channel, rois_num, rroiAlignParams, (half *)output);
}
}
__mlu_global__ void MLUUnion1KernelRoiAlignRotatedBackward(
const void *top_grad, const void *rois, void *bottom_grad, const int batch,
const int height, const int width, const int channel, const int rois_num,
const RoiAlignRotatedParams rroiAlignParams,
const cnrtDataType_t data_type) {
if (0x80 == coreId) {
return;
}
if (data_type == CNRT_FLOAT32) {
roiAlignRotatedBackward((float *)top_grad, (float *)rois, batch, height,
width, channel, rois_num, rroiAlignParams,
(float *)bottom_grad);
} else {
roiAlignRotatedBackward((half *)top_grad, (half *)rois, batch, height,
width, channel, rois_num, rroiAlignParams,
(half *)bottom_grad);
}
}
void KernelRoiAlignRotatedForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const void *features, const void *rois,
void *output, const int batch, const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams roiAlignRotatedParams) {
MLUUnion1KernelRoiAlignRotatedForward<<<k_dim, k_type, queue>>>(
features, rois, output, batch, height, width, channel, rois_num,
roiAlignRotatedParams, d_type);
}
void KernelRoiAlignRotatedBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const void *top_grad, const void *rois,
void *bottom_grad, const int batch, const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams roiAlignRotatedParams) {
MLUUnion1KernelRoiAlignRotatedBackward<<<k_dim, k_type, queue>>>(
top_grad, rois, bottom_grad, batch, height, width, channel, rois_num,
roiAlignRotatedParams, d_type);
}
mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp
deleted 100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef ROI_ALIGN_ROTATED_UTILS_HPP_
#define ROI_ALIGN_ROTATED_UTILS_HPP_
struct RoiAlignRotatedParams {
  int pooled_height;
  int pooled_width;
  int sample_ratio;
  float spatial_scale;
  bool aligned;
  bool clockwise;
};
#endif // ROI_ALIGN_ROTATED_UTILS_HPP_
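// Editor's usage sketch (the field values below are illustrative assumptions,
// not taken from the source; makeExampleParams is a hypothetical helper):
static inline RoiAlignRotatedParams makeExampleParams() {
  RoiAlignRotatedParams params;
  params.pooled_height = 7;
  params.pooled_width = 7;
  params.sample_ratio = 2;              // <= 0 selects adaptive sampling
  params.spatial_scale = 1.0f / 16.0f;  // e.g. a stride-16 feature map
  params.aligned = true;                // enables the 0.5-pixel offset
  params.clockwise = false;             // keep theta sign as given
  return params;
}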
mmcv/ops/csrc/common/mlu/roiaware_pool3d_mlu_kernel.mlu
deleted 100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#define ROI_OFFSET 7
#define FLOAT_NRAM_BUFFER_NUM 14
#define HALF_NRAM_BUFFER_NUM 25
#define ALIGN_NUM 64
__nram__ char data_nram[MAX_NRAM_SIZE];
template <typename T>
__mlu_global__ void MLUUnion1KernelPtsIdxOfVoxels(
const int pool_method, const int boxes_num, const int pts_num,
const int max_pts_each_voxel, const int out_x, const int out_y,
const int out_z, const T *rois, const T *pts, int *pts_idx_of_voxels) {
// params (T)rois: (boxes_num, 7)
// params (T)pts: (3, pts_num)
// params (int)pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z,
// max_pts_each_voxel)
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
int nram_pts_num = 0;
if (sizeof(T) == sizeof(float)) {
nram_pts_num = PAD_DOWN(
(MAX_NRAM_SIZE / sizeof(float) / FLOAT_NRAM_BUFFER_NUM), ALIGN_NUM);
} else {
nram_pts_num = PAD_DOWN(
(MAX_NRAM_SIZE / sizeof(half) / HALF_NRAM_BUFFER_NUM), ALIGN_NUM);
}
char *X = NULL;
char *Y = NULL;
char *Z = NULL;
char *local_X = NULL;
char *local_Y = NULL;
char *local_Z = NULL;
char *nram_pts_in_flag = NULL;
float *temp_buffer1 = NULL;
float *temp_buffer2 = NULL;
float *temp_buffer3 = NULL;
float *temp_buffer4 = NULL;
float *temp_buffer5 = NULL;
float *nram_voxel_offset = NULL;
int *nram_pts_idx_seq = NULL;
float *fp_local_X = NULL;
float *fp_local_Y = NULL;
float *fp_local_Z = NULL;
float *fp_nram_pts_in_flag = NULL;
if (sizeof(T) == sizeof(float)) {
X = (char *)((float *)data_nram);
Y = (char *)((float *)data_nram + nram_pts_num);
Z = (char *)((float *)data_nram + nram_pts_num * 2);
local_X = (char *)((float *)data_nram + nram_pts_num * 3);
local_Y = (char *)((float *)data_nram + nram_pts_num * 4);
local_Z = (char *)((float *)data_nram + nram_pts_num * 5);
nram_pts_in_flag = (char *)((float *)data_nram + nram_pts_num * 6);
temp_buffer1 = (float *)data_nram + nram_pts_num * 7;
temp_buffer2 = (float *)data_nram + nram_pts_num * 8;
temp_buffer3 = (float *)data_nram + nram_pts_num * 9;
temp_buffer4 = (float *)data_nram + nram_pts_num * 10;
temp_buffer5 = (float *)data_nram + nram_pts_num * 11;
nram_voxel_offset = (float *)data_nram + nram_pts_num * 12;
nram_pts_idx_seq = (int *)((float *)data_nram + nram_pts_num * 13);
fp_local_X = (float *)local_X;
fp_local_Y = (float *)local_Y;
fp_local_Z = (float *)local_Z;
fp_nram_pts_in_flag = (float *)nram_pts_in_flag;
} else {
X = (char *)((half *)data_nram);
Y = (char *)((half *)data_nram + nram_pts_num);
Z = (char *)((half *)data_nram + nram_pts_num * 2);
local_X = (char *)((half *)data_nram + nram_pts_num * 4);
local_Y = (char *)((half *)data_nram + nram_pts_num * 6);
local_Z = (char *)((half *)data_nram + nram_pts_num * 8);
nram_pts_in_flag = (char *)((half *)data_nram + nram_pts_num * 10);
temp_buffer1 = (float *)((half *)data_nram + nram_pts_num * 11);
temp_buffer2 = (float *)((half *)data_nram + nram_pts_num * 13);
temp_buffer3 = (float *)((half *)data_nram + nram_pts_num * 15);
temp_buffer4 = (float *)((half *)data_nram + nram_pts_num * 17);
temp_buffer5 = (float *)((half *)data_nram + nram_pts_num * 19);
nram_voxel_offset = (float *)((half *)data_nram + nram_pts_num * 21);
nram_pts_idx_seq = (int *)((half *)data_nram + nram_pts_num * 23);
fp_local_X = (float *)((half *)local_X - nram_pts_num);
fp_local_Y = (float *)((half *)local_Y - nram_pts_num);
fp_local_Z = (float *)((half *)local_Z - nram_pts_num);
fp_nram_pts_in_flag = (float *)((half *)nram_pts_in_flag - nram_pts_num);
}
for (int i = 0; i < nram_pts_num; i++) {
nram_pts_idx_seq[i] = i;
}
int nram_pts_loop_times = pts_num / nram_pts_num;
int rem_nram_num = pts_num % nram_pts_num;
for (int roi_index = taskId; roi_index < boxes_num; roi_index += taskDim) {
const T *cur_roi = rois + roi_index * ROI_OFFSET;
T cx = cur_roi[0];
T cy = cur_roi[1];
T cz = cur_roi[2];
T dx = cur_roi[3];
T dy = cur_roi[4];
T dz = cur_roi[5];
T rz = cur_roi[6];
T dx_2 = dx / 2.0;
T dy_2 = dy / 2.0;
T dz_2 = dz / 2.0;
for (int loop_idx = 0; loop_idx <= nram_pts_loop_times; loop_idx++) {
int load_pts_num =
(loop_idx == nram_pts_loop_times) ? rem_nram_num : nram_pts_num;
if (load_pts_num == 0) {
break;
}
int pts_offset_cur_loop = nram_pts_num * loop_idx;
int compute_pts_num = (loop_idx == nram_pts_loop_times)
? PAD_UP(rem_nram_num, ALIGN_NUM)
: nram_pts_num;
// load pts
__memcpy((void *)X, (T *)pts + pts_offset_cur_loop,
load_pts_num * sizeof(T), GDRAM2NRAM);
__memcpy((void *)Y, (T *)pts + pts_num + pts_offset_cur_loop,
load_pts_num * sizeof(T), GDRAM2NRAM);
__memcpy((void *)Z, (T *)pts + pts_num * 2 + pts_offset_cur_loop,
load_pts_num * sizeof(T), GDRAM2NRAM);
// fabs(local_z)
__bang_sub_scalar((T *)local_Z, (T *)Z, (T)cz, compute_pts_num);
__bang_sub_scalar((T *)temp_buffer1, (T *)Z, (T)(cz + dz_2),
compute_pts_num);
__bang_active_abs((T *)temp_buffer1, (T *)temp_buffer1, compute_pts_num);
#if __BANG_ARCH__ >= 322
__bang_le_scalar((T *)nram_pts_in_flag, (T *)temp_buffer1, (T)(dz_2),
compute_pts_num);
#else
__bang_write_value((void *)temp_buffer2, compute_pts_num, (T)(dz_2));
__bang_le((T *)nram_pts_in_flag, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
#endif
T cosa = std::cos(-rz);
T sina = std::sin(-rz);
__bang_sub_scalar((T *)temp_buffer3, (T *)X, (T)cx, compute_pts_num);
__bang_sub_scalar((T *)temp_buffer4, (T *)Y, (T)cy, compute_pts_num);
__bang_mul_scalar((T *)temp_buffer1, (T *)temp_buffer3, (T)cosa,
compute_pts_num);
__bang_mul_scalar((T *)temp_buffer2, (T *)temp_buffer4, (T)sina,
compute_pts_num);
// local_x
__bang_sub((T *)local_X, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
// fabs(local_x)
__bang_active_abs((T *)temp_buffer1, (T *)local_X, compute_pts_num);
// fabs(local_x) < dx/2 ? 1 : 0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar((T *)temp_buffer1, (T *)temp_buffer1, (T)(dx_2),
compute_pts_num);
#else
__bang_write_value((void *)temp_buffer2, compute_pts_num, (T)(dx_2));
__bang_lt((T *)temp_buffer1, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
#endif
__bang_and((T *)nram_pts_in_flag, (T *)nram_pts_in_flag,
(T *)temp_buffer1,
                 compute_pts_num);  // AND into the running in-box flag
__bang_mul_scalar((T *)temp_buffer1, (T *)temp_buffer3, (T)sina,
compute_pts_num);
__bang_mul_scalar((T *)temp_buffer2, (T *)temp_buffer4, (T)cosa,
compute_pts_num);
// local_y
__bang_add((T *)local_Y, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
// fabs(local_y)
__bang_active_abs((T *)temp_buffer1, (T *)local_Y, compute_pts_num);
// fabs(local_y) < dy/2 ? 1 : 0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar((T *)temp_buffer1, (T *)temp_buffer1, (T)(dy_2),
compute_pts_num);
#else
__bang_write_value((void *)temp_buffer2, compute_pts_num, (T)(dy_2));
__bang_lt((T *)temp_buffer1, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
#endif
__bang_and((T *)nram_pts_in_flag, (T *)nram_pts_in_flag,
(T *)temp_buffer1,
                 compute_pts_num);  // AND into the running in-box flag
T x_res = dx / out_x;
T y_res = dy / out_y;
T z_res = dz / out_z;
__bang_add_scalar((T *)local_X, (T *)local_X, (T)(dx_2), compute_pts_num);
__bang_add_scalar((T *)local_Y, (T *)local_Y, (T)(dy_2), compute_pts_num);
      // local_Z needs no dz/2.0 shift because cz is the box's bottom center
#if (__BANG_ARCH__ >= 322) && (__BANG_ARCH__ != 372)
__bang_div((T *)local_X, (T *)local_X, (T)x_res, compute_pts_num);
__bang_div((T *)local_Y, (T *)local_Y, (T)y_res, compute_pts_num);
__bang_div((T *)local_Z, (T *)local_Z, (T)z_res, compute_pts_num);
#else
__bang_mul_scalar((T *)local_X, (T *)local_X, (T)(1 / x_res),
compute_pts_num);
__bang_mul_scalar((T *)local_Y, (T *)local_Y, (T)(1 / y_res),
compute_pts_num);
__bang_mul_scalar((T *)local_Z, (T *)local_Z, (T)(1 / z_res),
compute_pts_num);
#endif
// float = float2int + int2float, half = half2int + int2float
if (sizeof(T) == sizeof(float)) {
#if __BANG_ARCH__ >= 322
__bang_float2int32_tz((int *)temp_buffer1, (float *)local_X,
compute_pts_num, 0);
__bang_float2int32_tz((int *)temp_buffer2, (float *)local_Y,
compute_pts_num, 0);
__bang_float2int32_tz((int *)temp_buffer3, (float *)local_Z,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_X, (int *)temp_buffer1,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_Y, (int *)temp_buffer2,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_Z, (int *)temp_buffer3,
compute_pts_num, 0);
#else
convertFloat2Int((int *)temp_buffer1, (float *)temp_buffer2,
(float *)fp_local_X, (float *)temp_buffer3,
compute_pts_num);
convertFloat2Int((int *)temp_buffer2, (float *)temp_buffer3,
(float *)fp_local_Y, (float *)temp_buffer4,
compute_pts_num);
convertFloat2Int((int *)temp_buffer3, (float *)temp_buffer4,
(float *)fp_local_Z, (float *)temp_buffer5,
compute_pts_num);
convertInt2Float((float *)fp_local_X, (float *)temp_buffer4,
(int *)temp_buffer1, (float *)temp_buffer5,
compute_pts_num);
convertInt2Float((float *)fp_local_Y, (float *)temp_buffer4,
(int *)temp_buffer2, (float *)temp_buffer5,
compute_pts_num);
convertInt2Float((float *)fp_local_Z, (float *)temp_buffer4,
(int *)temp_buffer3, (float *)temp_buffer5,
compute_pts_num);
#endif
} else {
__bang_half2float((float *)temp_buffer4, (half *)nram_pts_in_flag,
compute_pts_num);
__bang_move((void *)fp_nram_pts_in_flag, (void *)temp_buffer4,
compute_pts_num * sizeof(float));
#if __BANG_ARCH__ >= 322
__bang_half2int32_tz((int *)temp_buffer1, (half *)local_X,
compute_pts_num, 0);
__bang_half2int32_tz((int *)temp_buffer2, (half *)local_Y,
compute_pts_num, 0);
__bang_half2int32_tz((int *)temp_buffer3, (half *)local_Z,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_X, (int *)temp_buffer1,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_Y, (int *)temp_buffer2,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_Z, (int *)temp_buffer3,
compute_pts_num, 0);
#else
__bang_half2int16_tz((int16_t *)temp_buffer1, (half *)local_X,
compute_pts_num, 0);
__bang_half2int16_tz((int16_t *)temp_buffer2, (half *)local_Y,
compute_pts_num, 0);
__bang_half2int16_tz((int16_t *)temp_buffer3, (half *)local_Z,
compute_pts_num, 0);
__bang_int162float((float *)fp_local_X, (int16_t *)temp_buffer1,
compute_pts_num, 0);
__bang_int162float((float *)fp_local_Y, (int16_t *)temp_buffer2,
compute_pts_num, 0);
__bang_int162float((float *)fp_local_Z, (int16_t *)temp_buffer3,
compute_pts_num, 0);
#endif
}
      // clamp indices to be >= 0
__bang_write_value((float *)temp_buffer4, compute_pts_num, (float)0.0f);
__bang_maxequal((float *)fp_local_X, (float *)fp_local_X,
(float *)temp_buffer4, compute_pts_num);
__bang_maxequal((float *)fp_local_Y, (float *)fp_local_Y,
(float *)temp_buffer4, compute_pts_num);
__bang_maxequal((float *)fp_local_Z, (float *)fp_local_Z,
(float *)temp_buffer4, compute_pts_num);
      // clamp indices to at most out_x - 1 / out_y - 1 / out_z - 1
__bang_write_value((float *)temp_buffer5, compute_pts_num,
(float)(out_x - 1));
__bang_minequal((float *)fp_local_X, (float *)fp_local_X,
(float *)temp_buffer5, compute_pts_num);
__bang_write_value((float *)temp_buffer5, compute_pts_num,
(float)(out_y - 1));
__bang_minequal((float *)fp_local_Y, (float *)fp_local_Y,
(float *)temp_buffer5, compute_pts_num);
__bang_write_value((float *)temp_buffer5, compute_pts_num,
(float)(out_z - 1));
__bang_minequal((float *)fp_local_Z, (float *)fp_local_Z,
(float *)temp_buffer5, compute_pts_num);
__bang_mul_scalar((float *)temp_buffer1, (float *)fp_local_X,
(float)(out_y * out_z), compute_pts_num);
__bang_mul_scalar((float *)temp_buffer2, (float *)fp_local_Y,
(float)out_z, compute_pts_num);
__bang_mul_scalar((float *)temp_buffer3, (float *)fp_local_Z, (float)1.0,
compute_pts_num);
__bang_add((float *)nram_voxel_offset, (float *)temp_buffer1,
(float *)temp_buffer2, compute_pts_num);
__bang_add((float *)nram_voxel_offset, (float *)nram_voxel_offset,
(float *)temp_buffer3, compute_pts_num);
__bang_mul_scalar((float *)nram_voxel_offset, (float *)nram_voxel_offset,
(float)max_pts_each_voxel, compute_pts_num);
if (compute_pts_num != load_pts_num) {
__memset_nram((float *)fp_nram_pts_in_flag + load_pts_num,
compute_pts_num - load_pts_num, (float)0.0);
}
__bang_collect((float *)temp_buffer4, (float *)nram_pts_idx_seq,
(float *)fp_nram_pts_in_flag, compute_pts_num);
int pts_num_in_cur_roi =
(int)__bang_count((float *)fp_nram_pts_in_flag, compute_pts_num);
int *pts_idx_cur_voxels =
(int *)pts_idx_of_voxels +
roi_index * out_x * out_y * out_z * max_pts_each_voxel;
for (int idx = 0; idx < pts_num_in_cur_roi; idx++) {
int cur_pts_idx = *((int *)temp_buffer4 + idx);
int offset = (int)(*((float *)nram_voxel_offset + cur_pts_idx));
int cnt = pts_idx_cur_voxels[offset];
if (cnt < max_pts_each_voxel - 1) {
pts_idx_cur_voxels[offset + cnt + 1] =
cur_pts_idx + loop_idx * nram_pts_num;
pts_idx_cur_voxels[offset]++;
}
}
}
}
}
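// --- Editor's sketch: a scalar host-side reference of the vectorized in-box
// test above (ptInBox3dRef is an illustrative name; cz is the bottom center
// of the box, matching this kernel's ROI layout).
static inline bool ptInBox3dRef(const float x, const float y, const float z,
                                const float *box /* cx,cy,cz,dx,dy,dz,rz */) {
  const float cx = box[0], cy = box[1], cz = box[2];
  const float dx = box[3], dy = box[4], dz = box[5], rz = box[6];
  // the z test uses the shifted center cz + dz/2, with an inclusive bound
  if (std::abs(z - (cz + 0.5f * dz)) > 0.5f * dz) return false;
  const float cosa = std::cos(-rz), sina = std::sin(-rz);
  const float local_x = (x - cx) * cosa - (y - cy) * sina;
  const float local_y = (x - cx) * sina + (y - cy) * cosa;
  return std::abs(local_x) < 0.5f * dx && std::abs(local_y) < 0.5f * dy;
}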
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiawarePool3dForward(
const int pool_method, const int boxes_num, const int pts_num,
const int channels, const int max_pts_each_voxel, const int out_x,
const int out_y, const int out_z, const T *pts_feature,
const int *pts_idx_of_voxels, T *pooled_features, int *argmax) {
// params (T)pts_feature: (channels, pts_num)
// params (int)pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z,
// max_pts_each_voxel) params (int)argmax: (boxes_num, out_x, out_y, out_z,
// channels) params (T)pooled_features: (boxes_num, out_x, out_y, out_z,
// channels)
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
int align_num = NFU_ALIGN_SIZE / sizeof(T);
int align_max_pts_each_voxel = PAD_UP(max_pts_each_voxel, align_num);
int nram_channels_limit =
PAD_DOWN((MAX_NRAM_SIZE - 128 -
align_max_pts_each_voxel * (sizeof(int) + sizeof(T))) /
((align_max_pts_each_voxel + 1) * sizeof(T) + sizeof(int)),
align_num);
int *nram_pts_idx_cur_voxel = (int *)data_nram;
// nram_pts_idx_cur_voxel [align_max_pts_each_voxel]
T *nram_max_pts_feature_tmp =
(T *)((int *)nram_pts_idx_cur_voxel + align_max_pts_each_voxel);
// nram_max_pts_feature_tmp [align_max_pts_each_voxel]
T *nram_pts_feature_in_voxel =
((T *)nram_max_pts_feature_tmp + align_max_pts_each_voxel);
// nram_pts_feature_in_voxel [nram_channels_limit, align_max_pts_each_voxel]
T *nram_pooled_features_cur_voxel =
((T *)nram_pts_feature_in_voxel +
nram_channels_limit * align_max_pts_each_voxel);
// nram_pooled_features_cur_voxel [nram_channels_limit]
int *nram_argmax_cur_voxel =
(int *)((T *)nram_pooled_features_cur_voxel + nram_channels_limit);
// nram_argmax_cur_voxel [nram_channels_limit]
char *one_pooled_feature =
(char *)((int *)nram_argmax_cur_voxel + nram_channels_limit);
// one_pooled_feature [128]
int channels_loop_times = channels / nram_channels_limit;
int rem_channels = channels % nram_channels_limit;
for (int voxel_index = taskId;
voxel_index < boxes_num * out_x * out_y * out_z;
voxel_index += taskDim) {
int *pts_idx_cur_voxels =
(int *)pts_idx_of_voxels + voxel_index * max_pts_each_voxel;
__memcpy((void *)nram_pts_idx_cur_voxel, (void *)pts_idx_cur_voxels,
max_pts_each_voxel * sizeof(int), GDRAM2NRAM);
int pts_num_cur_voxel = nram_pts_idx_cur_voxel[0];
if (pts_num_cur_voxel == 0) {
continue;
}
for (int channels_loop_idx = 0; channels_loop_idx <= channels_loop_times;
channels_loop_idx++) {
int actual_channels_num = (channels_loop_idx == channels_loop_times)
? rem_channels
: nram_channels_limit;
if (actual_channels_num == 0) {
break;
}
int channels_offset = nram_channels_limit * channels_loop_idx;
#if ((__BANG_ARCH__ >= 200) && (__BANG_ARCH__ < 300))
int compute_channels_num = (channels_loop_idx == channels_loop_times)
? PAD_UP(rem_channels, align_num)
: nram_channels_limit;
if (pool_method == 0) {
__bang_write_value((void *)nram_pts_feature_in_voxel,
compute_channels_num * align_max_pts_each_voxel,
(T)-INFINITY);
}
#endif
T *pts_feature_cur_loop = (T *)pts_feature + channels_offset * pts_num;
for (int idx = 0; idx < pts_num_cur_voxel; idx++) {
__memcpy((T *)nram_pts_feature_in_voxel + idx,
(T *)pts_feature_cur_loop + nram_pts_idx_cur_voxel[idx + 1],
sizeof(T), GDRAM2NRAM, align_max_pts_each_voxel * sizeof(T),
pts_num * sizeof(T), actual_channels_num - 1);
}
for (int channel_idx = 0; channel_idx < actual_channels_num;
channel_idx++) {
if (pool_method == 0) {
#if __BANG_ARCH__ >= 322
__bang_argmax((T *)one_pooled_feature,
(T *)nram_pts_feature_in_voxel +
channel_idx * align_max_pts_each_voxel,
pts_num_cur_voxel);
T max_val = ((T *)one_pooled_feature)[0];
int max_idx = (int)(*(uint32_t *)((T *)one_pooled_feature + 1));
nram_pooled_features_cur_voxel[channel_idx] =
(max_val == -INFINITY) ? 0 : max_val;
nram_argmax_cur_voxel[channel_idx] =
(max_val == -INFINITY) ? -1 : nram_pts_idx_cur_voxel[max_idx + 1];
#else
          // __bang_max needs an aligned element count on the MLU200 series
if (sizeof(T) == sizeof(float)) {
__bang_max((float *)one_pooled_feature,
(float *)nram_pts_feature_in_voxel +
channel_idx * align_max_pts_each_voxel,
align_max_pts_each_voxel);
float max_val = ((float *)one_pooled_feature)[0];
__bang_write_value((void *)nram_max_pts_feature_tmp,
align_max_pts_each_voxel, (float)max_val);
__bang_eq((float *)nram_max_pts_feature_tmp,
(float *)nram_pts_feature_in_voxel +
channel_idx * align_max_pts_each_voxel,
(float *)nram_max_pts_feature_tmp,
align_max_pts_each_voxel);
int max_idx = (int)__bang_findfirst1(
(float *)nram_max_pts_feature_tmp, align_max_pts_each_voxel);
nram_pooled_features_cur_voxel[channel_idx] =
(max_val == -INFINITY) ? 0 : max_val;
nram_argmax_cur_voxel[channel_idx] =
(max_val == -INFINITY) ? -1
: nram_pts_idx_cur_voxel[max_idx + 1];
} else {
int max_idx = -1;
float max_val = -INFINITY;
for (int k = 0; k < pts_num_cur_voxel; k++) {
float pts_feature_cur_channel = __half2float_rd(
*((half *)nram_pts_feature_in_voxel +
channel_idx * align_max_pts_each_voxel + k));
if (pts_feature_cur_channel > max_val) {
max_val = pts_feature_cur_channel;
max_idx = k;
}
}
nram_pooled_features_cur_voxel[channel_idx] =
(max_idx == -1) ? 0 : max_val;
nram_argmax_cur_voxel[channel_idx] =
(max_idx == -1) ? -1 : nram_pts_idx_cur_voxel[max_idx + 1];
}
#endif
} else if (pool_method == 1) {
float sum_val_cur_channel = 0;
for (int k = 0; k < pts_num_cur_voxel; k++) {
sum_val_cur_channel += static_cast<float>(
((T *)nram_pts_feature_in_voxel)[channel_idx *
align_max_pts_each_voxel +
k]);
}
nram_pooled_features_cur_voxel[channel_idx] =
(T)(sum_val_cur_channel / pts_num_cur_voxel);
}
}
// store
__memcpy((T *)pooled_features + voxel_index * channels + channels_offset,
(void *)nram_pooled_features_cur_voxel,
actual_channels_num * sizeof(T), NRAM2GDRAM);
if (pool_method == 0) {
__memcpy((int *)argmax + voxel_index * channels + channels_offset,
(void *)nram_argmax_cur_voxel,
actual_channels_num * sizeof(int), NRAM2GDRAM);
}
}
}
}
void KernelPtsIdxOfVoxels(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const int pool_method, const int boxes_num,
const int pts_num, const int max_pts_each_voxel,
const int out_x, const int out_y, const int out_z,
const void *rois, const void *pts,
int *pts_idx_of_voxels) {
switch (d_type) {
case CNRT_FLOAT32: {
MLUUnion1KernelPtsIdxOfVoxels<float><<<k_dim, k_type, queue>>>(
pool_method, boxes_num, pts_num, max_pts_each_voxel, out_x, out_y,
out_z, (float *)rois, (float *)pts, (int *)pts_idx_of_voxels);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelPtsIdxOfVoxels<half><<<k_dim, k_type, queue>>>(
pool_method, boxes_num, pts_num, max_pts_each_voxel, out_x, out_y,
out_z, (half *)rois, (half *)pts, (int *)pts_idx_of_voxels);
}; break;
default: {
break;
}
}
}
void KernelRoiawarePool3dForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const int pool_method, const int boxes_num,
const int pts_num, const int channels, const int max_pts_each_voxel,
const int out_x, const int out_y, const int out_z, const void *pts_feature,
const int *pts_idx_of_voxels, void *pooled_features, int *argmax) {
switch (d_type) {
case CNRT_FLOAT32: {
MLUUnion1KernelRoiawarePool3dForward<float><<<k_dim, k_type, queue>>>(
pool_method, boxes_num, pts_num, channels, max_pts_each_voxel, out_x,
out_y, out_z, (float *)pts_feature, (int *)pts_idx_of_voxels,
(float *)pooled_features, (int *)argmax);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelRoiawarePool3dForward<half><<<k_dim, k_type, queue>>>(
pool_method, boxes_num, pts_num, channels, max_pts_each_voxel, out_x,
out_y, out_z, (half *)pts_feature, (int *)pts_idx_of_voxels,
(half *)pooled_features, (int *)argmax);
}; break;
default: {
break;
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiawareMaxPool3dBackward(
const int boxes_num, const int out_x, const int out_y, const int out_z,
const int channels, const int *argmax, const T *grad_out, T *grad_in) {
// params (int)argmax: (boxes_num, out_x, out_y, out_z, channels)
// params (T)grad_out: (boxes_num, out_x, out_y, out_z, channels)
// params (T)grad_in: (pts_num, channels)
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
int nram_channels_limit =
(MAX_NRAM_SIZE - sizeof(T) * 1) / (sizeof(T) + sizeof(int));
int *nram_argmax_cur_loop = (int *)data_nram;
// nram_argmax_cur_loop [nram_channels_limit]
T *nram_grad_out_cur_loop =
(T *)((int *)nram_argmax_cur_loop + nram_channels_limit);
// nram_grad_out_cur_loop [nram_channels_limit]
T *nram_grad_in_cur_channel =
(T *)nram_grad_out_cur_loop + nram_channels_limit;
// nram_grad_in_cur_channel [1]
int channels_loop_times = channels / nram_channels_limit;
int rem_channels = channels % nram_channels_limit;
int voxels_num = boxes_num * out_x * out_y * out_z;
for (int voxel_index = taskId; voxel_index < voxels_num;
voxel_index += taskDim) {
const int *argmax_cur_voxel = argmax + voxel_index * channels;
const T *grad_out_cur_voxel = grad_out + voxel_index * channels;
for (int channels_loop_idx = 0; channels_loop_idx <= channels_loop_times;
channels_loop_idx++) {
int actual_channels_num = (channels_loop_idx == channels_loop_times)
? rem_channels
: nram_channels_limit;
if (actual_channels_num == 0) {
break;
}
const int *argmax_cur_loop =
argmax_cur_voxel + nram_channels_limit * channels_loop_idx;
const T *grad_out_cur_loop =
grad_out_cur_voxel + nram_channels_limit * channels_loop_idx;
__memcpy((void *)nram_argmax_cur_loop, (void *)argmax_cur_loop,
actual_channels_num * sizeof(int), GDRAM2NRAM);
__memcpy((void *)nram_grad_out_cur_loop, (void *)grad_out_cur_loop,
actual_channels_num * sizeof(T), GDRAM2NRAM);
for (int channel_idx = 0; channel_idx < actual_channels_num;
channel_idx++) {
int *nram_argmax_cur_channel = nram_argmax_cur_loop + channel_idx;
T *nram_grad_out_cur_channel = nram_grad_out_cur_loop + channel_idx;
if (nram_argmax_cur_channel[0] == -1) {
continue;
}
T *grad_in_cur_channel =
grad_in + nram_argmax_cur_channel[0] * channels +
nram_channels_limit * channels_loop_idx + channel_idx;
__bang_atomic_add((T *)nram_grad_in_cur_channel,
(T *)grad_in_cur_channel,
(T *)(nram_grad_out_cur_channel), 1);
}
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiawareAvgPool3dBackward(
const int boxes_num, const int out_x, const int out_y, const int out_z,
const int channels, const int max_pts_each_voxel,
const int *pts_idx_of_voxels, const T *grad_out, T *grad_in) {
// params (int)pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z,
// max_pts_each_voxel) params (T)grad_out: (boxes_num, out_x, out_y, out_z,
// channels) params (T)grad_in: (pts_num, channels)
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
int align_num = NFU_ALIGN_SIZE / sizeof(T);
int align_max_pts_each_voxel = PAD_UP(max_pts_each_voxel, align_num);
int nram_channels_limit = PAD_DOWN(
(MAX_NRAM_SIZE - align_max_pts_each_voxel * sizeof(int)) / 2 / sizeof(T),
align_num);
int *nram_pts_idx_cur_voxel = (int *)data_nram;
// nram_pts_idx_cur_voxel [align_max_pts_each_voxel]
T *nram_grad_out_cur_loop =
(T *)((int *)nram_pts_idx_cur_voxel + align_max_pts_each_voxel);
// nram_grad_out_cur_loop [nram_channels_limit]
T *nram_grad_in_cur_loop = (T *)nram_grad_out_cur_loop + nram_channels_limit;
// nram_grad_in_cur_loop [nram_channels_limit]
int channels_loop_times = channels / nram_channels_limit;
int rem_channels = channels % nram_channels_limit;
int voxels_num = boxes_num * out_x * out_y * out_z;
for (int voxel_index = taskId; voxel_index < voxels_num;
voxel_index += taskDim) {
const T *grad_out_cur_voxel = grad_out + voxel_index * channels;
const int *pts_idx_cur_voxel =
pts_idx_of_voxels + voxel_index * max_pts_each_voxel;
__memcpy((void *)nram_pts_idx_cur_voxel, (void *)pts_idx_cur_voxel,
max_pts_each_voxel * sizeof(int), GDRAM2NRAM);
int total_pts_of_voxel = nram_pts_idx_cur_voxel[0];
if (total_pts_of_voxel <= 0) {
continue;
}
float cur_grad = 1.0 / ((float)total_pts_of_voxel);
for (int channels_loop_idx = 0; channels_loop_idx <= channels_loop_times;
channels_loop_idx++) {
int actual_channels_num = (channels_loop_idx == channels_loop_times)
? rem_channels
: nram_channels_limit;
if (actual_channels_num == 0) {
break;
}
const T *grad_out_cur_loop =
grad_out_cur_voxel + nram_channels_limit * channels_loop_idx;
__memcpy((void *)nram_grad_in_cur_loop, (void *)grad_out_cur_loop,
actual_channels_num * sizeof(T), GDRAM2NRAM);
int align_actual_channels_num = PAD_UP(actual_channels_num, align_num);
if (sizeof(T) == sizeof(half)) {
__bang_half2float((float *)nram_grad_out_cur_loop,
(half *)nram_grad_in_cur_loop,
align_actual_channels_num);
__bang_mul_scalar((float *)nram_grad_out_cur_loop,
(float *)nram_grad_out_cur_loop, (float)cur_grad,
align_actual_channels_num);
convertFloat2half((half *)nram_grad_out_cur_loop,
(float *)nram_grad_out_cur_loop,
align_actual_channels_num);
} else {
__bang_mul_scalar((float *)nram_grad_out_cur_loop,
(float *)nram_grad_in_cur_loop, (float)cur_grad,
align_actual_channels_num);
}
for (int k = 1; k <= total_pts_of_voxel; k++) {
T *grad_in_cur_loop = grad_in + nram_pts_idx_cur_voxel[k] * channels +
nram_channels_limit * channels_loop_idx;
__bang_atomic_add((T *)nram_grad_in_cur_loop, (T *)grad_in_cur_loop,
(T *)nram_grad_out_cur_loop, actual_channels_num);
}
}
}
}
void KernelRoiawarePool3dBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const int pool_method, const int boxes_num,
const int out_x, const int out_y, const int out_z, const int channels,
const int max_pts_each_voxel, const int *pts_idx_of_voxels,
const int *argmax, const void *grad_out, void *grad_in) {
if (pool_method == 0) {
switch (d_type) {
case CNRT_FLOAT32: {
MLUUnion1KernelRoiawareMaxPool3dBackward<float>
<<<k_dim, k_type, queue>>>(boxes_num, out_x, out_y, out_z, channels,
(int *)argmax, (float *)grad_out,
(float *)grad_in);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelRoiawareMaxPool3dBackward<half>
<<<k_dim, k_type, queue>>>(boxes_num, out_x, out_y, out_z, channels,
(int *)argmax, (half *)grad_out,
(half *)grad_in);
}; break;
default: {
break;
}
}
} else {
switch (d_type) {
case CNRT_FLOAT32: {
MLUUnion1KernelRoiawareAvgPool3dBackward<float>
<<<k_dim, k_type, queue>>>(
boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,
(int *)pts_idx_of_voxels, (float *)grad_out, (float *)grad_in);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelRoiawareAvgPool3dBackward<half>
<<<k_dim, k_type, queue>>>(
boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,
(int *)pts_idx_of_voxels, (half *)grad_out, (half *)grad_in);
}; break;
default: {
break;
}
}
}
}
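// Worked example of the average-pool backward path above (illustrative
// numbers): a voxel holding n = 4 points with grad_out = 0.8 on some channel
// scales the gradient by cur_grad = 1 / 4, so each of the 4 indexed points is
// atomically incremented by 0.2 on that channel.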
mmcv/ops/csrc/common/mlu/roipoint_pool3d_large_boxes_num_mlu_kernel.mlu
deleted 100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
/*************************************************************************
*
* NRAM partition:
* | boxes3d | ping points + pong points | aux_a ~ aux_f |
* | 7 * sizeof(T) | 6 * deal_num * sizeof(T) | 6 * deal_num * sizeof(T) |
*
*************************************************************************/
#define TWELVE_SPLIT 12
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void checkPointsInBox3d(const T *boxes3d,
const size_t deal_num,
T *x,
T *y,
T *z,
T *auxiliary_a,
T *auxiliary_b,
T *auxiliary_c,
T *auxiliary_d,
T *auxiliary_e,
T *auxiliary_f,
T *pts_assign) {
  // param box3d: (cx, cy, cz, dx, dy, dz, rz) in the LiDAR coordinate frame
T cx = boxes3d[0];
T cy = boxes3d[1];
T cz = boxes3d[2];
T dx = boxes3d[3];
T dy = boxes3d[4];
T dz = boxes3d[5];
T rz = boxes3d[6];
// shift to the center since cz in box3d is the bottom center
cz += 0.5 * dz;
T cosa = (T)std::cos(-rz);
T sina = (T)std::sin(-rz);
// x - cx
__bang_sub_scalar((T *)auxiliary_a, (T *)x, (T)cx, deal_num);
// y - cy
__bang_sub_scalar((T *)auxiliary_b, (T *)y, (T)cy, deal_num);
// z - cz
__bang_sub_scalar((T *)auxiliary_c, (T *)z, (T)cz, deal_num);
// |z - cz|
__bang_active_abs((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
// |z - cz| > dz / 2.0
#if __BANG_ARCH__ >= 322
__bang_gt_scalar((T *)auxiliary_c, (T *)auxiliary_c, (T)(0.5 * dz), deal_num);
#else
__bang_write_value((T *)auxiliary_d, deal_num, (T)(0.5 * dz));
__bang_lt((T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_c, deal_num);
#endif
// !(|z - cz| > dz / 2.0)
__bang_not((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
// (x - cx) * cos(-rz)
__bang_mul_scalar((T *)auxiliary_d, (T *)auxiliary_a, (T)cosa, deal_num);
// (y - cy) * sin(-rz)
__bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_b, (T)sina, deal_num);
// local_x = (x - cx) * cos(-rz) + (y - cy) * -sin(-rz)
__bang_sub((T *)auxiliary_d, (T *)auxiliary_d, (T *)auxiliary_e, deal_num);
// |local_x|
__bang_active_abs((T *)auxiliary_d, (T *)auxiliary_d, deal_num);
// |local_x| < dx / 2.0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar(auxiliary_d, auxiliary_d, (T)(0.5 * dx), deal_num);
#else
__bang_write_value((T *)auxiliary_e, deal_num, (T)(0.5 * dx));
__bang_gt((T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_d, deal_num);
#endif
// (x - cx) * sin(-rz)
__bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_a, (T)sina, deal_num);
// (y - cy) * cos(-rz)
__bang_mul_scalar((T *)auxiliary_f, (T *)auxiliary_b, (T)cosa, deal_num);
// local_y = (x - cx) * sin(-rz) + (y - cy) * cos(-rz)
__bang_add((T *)auxiliary_e, (T *)auxiliary_e, (T *)auxiliary_f, deal_num);
// |local_y|
__bang_active_abs((T *)auxiliary_e, (T *)auxiliary_e, deal_num);
// |local_y| < dy / 2.0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar(auxiliary_e, auxiliary_e, (T)(0.5 * dy), deal_num);
#else
__bang_write_value((T *)auxiliary_f, deal_num, (T)(0.5 * dy));
__bang_gt((T *)auxiliary_e, (T *)auxiliary_f, (T *)auxiliary_e, deal_num);
#endif
  // pts_assign = |local_x| < dx / 2.0 && |local_y| < dy / 2.0 && |z - cz| <= dz / 2.0
__bang_mul((T *)pts_assign, (T *)auxiliary_c, (T *)auxiliary_d, deal_num);
__bang_mul((T *)pts_assign, (T *)pts_assign, (T *)auxiliary_e, deal_num);
}
template <typename T>
__mlu_func__ void computeStoreRoipointPool3d(char *boxes3d,
int *cnt,
char *points_x,
char *points_y,
char *points_z,
const char *point_features,
char *auxiliary_a,
char *auxiliary_b,
char *auxiliary_c,
char *auxiliary_d,
char *auxiliary_e,
char *auxiliary_f,
const int box_idx,
const int pts_num,
const int feature_in_len,
const int sampled_pts_num,
const size_t span_num_deal,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
char *pts_assign = auxiliary_a;
if (*cnt >= sampled_pts_num) {
return;
}
checkPointsInBox3d((T *)boxes3d, span_num_deal, (T *)points_x, (T *)points_y, (T *)points_z,
(T *)auxiliary_a, (T *)auxiliary_b, (T *)auxiliary_c, (T *)auxiliary_d,
(T *)auxiliary_e, (T *)auxiliary_f, (T *)pts_assign);
  // __bang_select returns the vector of selected elements along with the number of selected elements
__bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
uint32_t select_num = *((uint32_t *)auxiliary_b);
if (select_num == 0) {
return;
}
int sampled_pts_num_rem = sampled_pts_num - *cnt;
int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
// copy x to pooled_features_gdram
// The result of __bang_select is composed of three parts:
  // The first 4 bytes hold the number of selected elements as an unsigned int.
  // The next 124 bytes are zero padding; the remaining bytes are the selected elements.
int select_num_size = 128;
__memcpy(
pooled_features_gdram + (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T),
(T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
1 * sizeof(T),
(T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
2 * sizeof(T),
(T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy features to pooled_features_gdram
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
(3 + c_idx) * sizeof(T),
auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
}
*cnt += select_num;
}
template <typename T>
__mlu_func__ void computeStoreLastBlockRoipointPool3d(char *boxes3d,
int *cnt,
char *points_x,
char *points_y,
char *points_z,
const char *point_features,
char *auxiliary_a,
char *auxiliary_b,
char *auxiliary_c,
char *auxiliary_d,
char *auxiliary_e,
char *auxiliary_f,
const int box_idx,
const int pts_num,
const int feature_in_len,
const int sampled_pts_num,
const size_t span_num_deal,
const size_t auxiliary_num_deal,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
char *pts_assign = auxiliary_a;
if (*cnt >= sampled_pts_num) {
// pooled_empty_flag_gdram set 0
*((int *)auxiliary_a) = 0;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
return;
}
checkPointsInBox3d((T *)boxes3d, span_num_deal, (T *)points_x, (T *)points_y, (T *)points_z,
(T *)auxiliary_a, (T *)auxiliary_b, (T *)auxiliary_c, (T *)auxiliary_d,
(T *)auxiliary_e, (T *)auxiliary_f, (T *)pts_assign);
  // __bang_select returns the vector of selected elements along with the number of selected elements
__bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
uint32_t select_num = *((uint32_t *)auxiliary_b);
if (*cnt + select_num == 0) {
// pooled_empty_flag_gdram set 1
*((int *)auxiliary_a) = 1;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
// pooled_features_gdram set 0
int repeat = (sampled_pts_num * (3 + feature_in_len)) / (auxiliary_num_deal * 6);
int rem = (sampled_pts_num * (3 + feature_in_len)) % (auxiliary_num_deal * 6);
// use auxiliary_a to auxiliary_f
__bang_write_zero((T *)auxiliary_a, PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE));
if (repeat > 0) {
__memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
auxiliary_a, auxiliary_num_deal * 6 * sizeof(T), NRAM2GDRAM,
auxiliary_num_deal * 6 * sizeof(T), 0, repeat - 1);
}
if (rem > 0) {
__memcpy(pooled_features_gdram +
box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T) +
repeat * auxiliary_num_deal * 6 * sizeof(T),
auxiliary_a, rem * sizeof(T), NRAM2GDRAM);
}
return;
}
if (select_num > 0) {
int sampled_pts_num_rem = sampled_pts_num - *cnt;
int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
// copy x to pooled_features_gdram
// The result of __bang_select is composed of three parts:
    // The first 4 bytes hold the number of selected elements as an unsigned int.
    // The next 124 bytes are zero padding; the remaining bytes are the selected elements.
int select_num_size = 128;
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T),
(T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
1 * sizeof(T),
(T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
2 * sizeof(T),
(T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy features to pooled_features_gdram
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
(3 + c_idx) * sizeof(T),
auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
}
}
// pooled_empty_flag_gdram set 0
*((int *)auxiliary_a) = 0;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
*cnt += select_num;
if (*cnt < sampled_pts_num) {
    // duplicate already-selected points to pad the output up to sampled_pts_num
int repeat = sampled_pts_num / (*cnt) - 1;
int rem = sampled_pts_num % (*cnt);
if (repeat > 0) {
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T),
pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
(*cnt) * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM,
(*cnt) * (3 + feature_in_len) * sizeof(T), 0, repeat - 1);
}
if (rem > 0) {
__memcpy(
pooled_features_gdram +
(box_idx * sampled_pts_num + (repeat + 1) * (*cnt)) * (3 + feature_in_len) *
sizeof(T),
pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
rem * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM);
}
}
}
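// Worked example of the duplication above (illustrative numbers): with
// sampled_pts_num = 512 and cnt = 100 in-box points, repeat = 512/100 - 1 = 4
// extra full copies of the first 100 rows are written, then rem = 512 % 100 =
// 12 more rows, so the pooled output always holds exactly 512 rows.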
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
if (coreId == 0x80) {
return;
}
size_t boxes_per_core = (batch_size * boxes_num) / taskDim;
size_t boxes_rem = (batch_size * boxes_num) % taskDim;
  // calc batch_start, batch_end, first_batch_box_start, last_batch_box_end for each core
int32_t batch_start = taskId < (boxes_rem + 1) ?
(taskId * (boxes_per_core + 1)) / boxes_num :
(taskId * boxes_per_core + boxes_rem) / boxes_num;
int32_t batch_end = taskId < boxes_rem ?
((taskId + 1) * (boxes_per_core + 1) - 1) / boxes_num :
((taskId + 1) * boxes_per_core + boxes_rem - 1) / boxes_num;
size_t first_batch_box_start = taskId < (boxes_rem + 1) ?
(taskId * (boxes_per_core + 1)) - batch_start * boxes_num :
taskId * boxes_per_core + boxes_rem - batch_start * boxes_num;
size_t last_batch_box_end = taskId < boxes_rem ?
(taskId + 1) * (boxes_per_core + 1) - batch_end * boxes_num :
((taskId + 1) * boxes_per_core + boxes_rem) - batch_end * boxes_num;
// points_xyz : [3, B, N]
const char *points_x_gdram = points_xyz_gdram;
const char *points_y_gdram = points_xyz_gdram + (1 * batch_size * pts_num) * sizeof(T);
const char *points_z_gdram = points_xyz_gdram + (2 * batch_size * pts_num) * sizeof(T);
size_t boxes3d_size = PAD_UP(7, NFU_ALIGN_SIZE) * sizeof(T);
size_t span_num_deal = PAD_DOWN(MAX_NRAM_SIZE / TWELVE_SPLIT / sizeof(T), NFU_ALIGN_SIZE);
size_t align_num = NFU_ALIGN_SIZE;
int32_t repeat = pts_num / span_num_deal;
size_t rem = pts_num % span_num_deal;
size_t align_rem = CEIL_ALIGN(rem, align_num);
char *boxes3d = nram_buffer;
char *ping_points_x = nram_buffer + boxes3d_size;
char *ping_points_y = ping_points_x + span_num_deal * sizeof(T);
char *ping_points_z = ping_points_y + span_num_deal * sizeof(T);
size_t ping_pong_gap = 3 * span_num_deal * sizeof(T);
char *auxiliary_a = ping_points_x + 2 * ping_pong_gap;
char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T);
char *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T);
char *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T);
char *auxiliary_e = auxiliary_d + span_num_deal * sizeof(T);
char *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T);
size_t span_load_input1_size = span_num_deal * sizeof(T);
size_t span_load_input2_size = span_num_deal * sizeof(T);
size_t span_load_input3_size = span_num_deal * sizeof(T);
size_t span_load_input4_size = span_num_deal * sizeof(T);
int cnt = 0;
for (int bs_idx = batch_start; bs_idx <= batch_end; bs_idx++) {
const char *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T);
const char *points_y_start = points_y_gdram + bs_idx * pts_num * sizeof(T);
const char *points_z_start = points_z_gdram + bs_idx * pts_num * sizeof(T);
const char *point_features_start =
point_features_gdram + bs_idx * feature_in_len * pts_num * sizeof(T);
char *pooled_features_start =
pooled_features_gdram +
(bs_idx * boxes_num * sampled_pts_num * (3 + feature_in_len)) * sizeof(T);
char *pooled_empty_flag_start = pooled_empty_flag_gdram + bs_idx * boxes_num * sizeof(int);
size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0;
size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num;
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
__memcpy_async(boxes3d,
boxes3d_gdram + bs_idx * boxes_num * 7 * sizeof(T) + box_idx * 7 * sizeof(T),
7 * sizeof(T), GDRAM2NRAM);
cnt = 0;
if (repeat > 0) {
__memcpy_async(ping_points_x, points_x_start, span_load_input1_size, GDRAM2NRAM);
__memcpy_async(ping_points_y, points_y_start, span_load_input2_size, GDRAM2NRAM);
__memcpy_async(ping_points_z, points_z_start, span_load_input3_size, GDRAM2NRAM);
__asm__ volatile("sync;");
}
for (int i = 0; i < repeat - 1; i++) {
__memcpy_async(ping_points_x + ((i + 1) % 2) * ping_pong_gap,
points_x_start + (i + 1) * span_load_input1_size, span_load_input1_size,
GDRAM2NRAM);
__memcpy_async(ping_points_y + ((i + 1) % 2) * ping_pong_gap,
points_y_start + (i + 1) * span_load_input2_size, span_load_input2_size,
GDRAM2NRAM);
__memcpy_async(ping_points_z + ((i + 1) % 2) * ping_pong_gap,
points_z_start + (i + 1) * span_load_input3_size, span_load_input3_size,
GDRAM2NRAM);
computeStoreRoipointPool3d<T>(
boxes3d, &cnt, ping_points_x + (i % 2) * ping_pong_gap,
ping_points_y + (i % 2) * ping_pong_gap, ping_points_z + (i % 2) * ping_pong_gap,
point_features_start + i * span_load_input4_size, auxiliary_a, auxiliary_b, auxiliary_c,
auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
__asm__ volatile("sync;");
}
if (rem > 0) {
        // pad the tail of the remainder with NAN so the padded lanes never
        // pass the in-box test (float and half take the same path)
        __bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap +
                                 PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
                           NFU_ALIGN_SIZE, (T)NAN);
        __bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap +
                                 PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
                           NFU_ALIGN_SIZE, (T)NAN);
        __bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap +
                                 PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
                           NFU_ALIGN_SIZE, (T)NAN);
__memcpy_async(ping_points_x + (repeat % 2) * ping_pong_gap,
points_x_start + repeat * span_load_input1_size, rem * sizeof(T),
GDRAM2NRAM);
__memcpy_async(ping_points_y + (repeat % 2) * ping_pong_gap,
points_y_start + repeat * span_load_input2_size, rem * sizeof(T),
GDRAM2NRAM);
__memcpy_async(ping_points_z + (repeat % 2) * ping_pong_gap,
points_z_start + repeat * span_load_input3_size, rem * sizeof(T),
GDRAM2NRAM);
}
if (repeat > 0 && rem > 0) {
computeStoreRoipointPool3d<T>(
boxes3d, &cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
} else if (repeat > 0 && rem == 0) {
computeStoreLastBlockRoipointPool3d<T>(
boxes3d, &cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, span_num_deal, pooled_features_start,
pooled_empty_flag_start);
}
if (rem > 0) {
__asm__ volatile("sync;");
computeStoreLastBlockRoipointPool3d<T>(
boxes3d, &cnt, ping_points_x + (repeat % 2) * ping_pong_gap,
ping_points_y + (repeat % 2) * ping_pong_gap,
ping_points_z + (repeat % 2) * ping_pong_gap,
point_features_start + repeat * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, align_rem, span_num_deal, pooled_features_start,
pooled_empty_flag_start);
}
}
}
}
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<float>(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram);
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<half>(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram);
void KernelRoiPointPool3dLargeBoxesNumForward(cnrtDim3_t k_dim,
cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const void *points_xyz,
const void *boxes3d,
const void *point_features,
void *pooled_features,
int *pooled_empty_flag) {
switch (d_type) {
default: { break; }
case CNRT_FLOAT32: {
MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<float><<<k_dim, k_type, queue>>>(
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
(char *)points_xyz, (char *)point_features, (char *)boxes3d,
(char *)pooled_features, (char *)pooled_empty_flag);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<half><<<k_dim, k_type, queue>>>(
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
(char *)points_xyz, (char *)point_features, (char *)boxes3d,
(char *)pooled_features, (char *)pooled_empty_flag);
}; break;
}
}
mmcv/ops/csrc/common/mlu/roipoint_pool3d_mlu_kernel.mlu
deleted 100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
/**************************************************************************************
*
* NRAM partition:
* | boxes3d | cnt |
* | boxes_num * 7 * sizeof(T) | boxes_num * sizeof(int) |
*
* | ping points | pong points | aux_a ~ aux_f |
* | 3 * deal_num * sizeof(T) | 3 * deal_num * sizeof(T) | 6 * deal_num * sizeof(T) |
*
***************************************************************************************/
#define TWELVE_SPLIT 12
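// TWELVE_SPLIT: the point buffers carve NRAM into 12 equal spans of
// span_num_deal elements each -- 3 ping + 3 pong + 6 auxiliary -- matching
// the partition sketched above.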
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void checkPointsInBox3d(const T *boxes3d,
const size_t deal_num,
T *x,
T *y,
T *z,
T *auxiliary_a,
T *auxiliary_b,
T *auxiliary_c,
T *auxiliary_d,
T *auxiliary_e,
T *auxiliary_f,
T *pts_assign) {
// param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate
T cx = boxes3d[0];
T cy = boxes3d[1];
T cz = boxes3d[2];
T dx = boxes3d[3];
T dy = boxes3d[4];
T dz = boxes3d[5];
T rz = boxes3d[6];
// shift to the center since cz in box3d is the bottom center
cz += 0.5 * dz;
T cosa = (T)std::cos(-rz);
T sina = (T)std::sin(-rz);
// x - cx
__bang_sub_scalar((T *)auxiliary_a, (T *)x, (T)cx, deal_num);
// y - cy
__bang_sub_scalar((T *)auxiliary_b, (T *)y, (T)cy, deal_num);
// z - cz
__bang_sub_scalar((T *)auxiliary_c, (T *)z, (T)cz, deal_num);
// |z - cz|
__bang_active_abs((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
// |z - cz| > dz / 2.0
#if __BANG_ARCH__ >= 322
__bang_gt_scalar((T *)auxiliary_c, (T *)auxiliary_c, (T)(0.5 * dz), deal_num);
#else
__bang_write_value((T *)auxiliary_d, deal_num, (T)(0.5 * dz));
__bang_lt((T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_c, deal_num);
#endif
// !(|z - cz| > dz / 2.0)
__bang_not((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
// (x - cx) * cos(-rz)
__bang_mul_scalar((T *)auxiliary_d, (T *)auxiliary_a, (T)cosa, deal_num);
// (y - cy) * sin(-rz)
__bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_b, (T)sina, deal_num);
// local_x = (x - cx) * cos(-rz) + (y - cy) * -sin(-rz)
__bang_sub((T *)auxiliary_d, (T *)auxiliary_d, (T *)auxiliary_e, deal_num);
// |local_x|
__bang_active_abs((T *)auxiliary_d, (T *)auxiliary_d, deal_num);
// |local_x| < dx / 2.0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar(auxiliary_d, auxiliary_d, (T)(0.5 * dx), deal_num);
#else
__bang_write_value((T *)auxiliary_e, deal_num, (T)(0.5 * dx));
__bang_gt((T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_d, deal_num);
#endif
// (x - cx) * sin(-rz)
__bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_a, (T)sina, deal_num);
// (y - cy) * cos(-rz)
__bang_mul_scalar((T *)auxiliary_f, (T *)auxiliary_b, (T)cosa, deal_num);
// local_y = (x - cx) * sin(-rz) + (y - cy) * cos(-rz)
__bang_add((T *)auxiliary_e, (T *)auxiliary_e, (T *)auxiliary_f, deal_num);
// |local_y|
__bang_active_abs((T *)auxiliary_e, (T *)auxiliary_e, deal_num);
// |local_y| < dy / 2.0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar(auxiliary_e, auxiliary_e, (T)(0.5 * dy), deal_num);
#else
__bang_write_value((T *)auxiliary_f, deal_num, (T)(0.5 * dy));
__bang_gt((T *)auxiliary_e, (T *)auxiliary_f, (T *)auxiliary_e, deal_num);
#endif
// pts_assign = |x - cx| < dx / 2.0 && |y - cy| < dy / 2.0 && |z - cz| <= dz / 2.0
__bang_mul((T *)pts_assign, (T *)auxiliary_c, (T *)auxiliary_d, deal_num);
__bang_mul((T *)pts_assign, (T *)pts_assign, (T *)auxiliary_e, deal_num);
}
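// For reference, the vectorized test above reduces to this scalar sketch
// (illustrative only, not part of the original kernel; assumes <cmath>):
static bool pointInBox3dRef(float x, float y, float z, const float box[7]) {
  float cx = box[0], cy = box[1], cz = box[2] + 0.5f * box[5];
  float cosa = std::cos(-box[6]), sina = std::sin(-box[6]);
  float local_x = (x - cx) * cosa - (y - cy) * sina;  // rotate into box frame
  float local_y = (x - cx) * sina + (y - cy) * cosa;
  return std::abs(z - cz) <= 0.5f * box[5] &&  // within half-height
         std::abs(local_x) < 0.5f * box[3] &&  // within half-length
         std::abs(local_y) < 0.5f * box[4];    // within half-width
}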
template <typename T>
__mlu_func__ void computeStoreRoipointPool3d(char *boxes3d,
int *cnt,
char *points_x,
char *points_y,
char *points_z,
const char *point_features,
char *auxiliary_a,
char *auxiliary_b,
char *auxiliary_c,
char *auxiliary_d,
char *auxiliary_e,
char *auxiliary_f,
const int box_idx,
const int pts_num,
const int feature_in_len,
const int sampled_pts_num,
const size_t span_num_deal,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
char *pts_assign = auxiliary_a;
if (cnt[box_idx] >= sampled_pts_num) {
return;
}
checkPointsInBox3d((T *)(boxes3d + box_idx * 7 * sizeof(T)), span_num_deal, (T *)points_x,
(T *)points_y, (T *)points_z, (T *)auxiliary_a, (T *)auxiliary_b,
(T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_f,
(T *)pts_assign);
// __bang_select returns selected elements vector and the number of selected elements
__bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
uint32_t select_num = *((uint32_t *)auxiliary_b);
if (select_num == 0) {
return;
}
int sampled_pts_num_rem = sampled_pts_num - cnt[box_idx];
int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
// copy x to pooled_features_gdram
// The result of __bang_select is composed of three parts:
  // the first 4 bytes hold the number of selected elements (unsigned int);
  // the next 124 bytes are zero; the remaining bytes are the selected elements.
int select_num_size = 128;
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T),
(T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
1 * sizeof(T),
(T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
2 * sizeof(T),
(T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy features to pooled_features_gdram
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
(3 + c_idx) * sizeof(T),
auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
}
cnt[box_idx] += select_num;
}
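// Illustrative accessors for the __bang_select result layout documented in
// the comment above (an assumption drawn from that comment, not an official
// API): 4 bytes of count, 124 bytes of padding, then the packed elements.
static inline uint32_t selectCount(const void *buf) {
  return *(const uint32_t *)buf;  // first 4 bytes: number of selected elems
}
template <typename T>
static inline const T *selectData(const void *buf) {
  return (const T *)((const char *)buf + 128);  // skip the 4 + 124 byte header
}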
template <typename T>
__mlu_func__ void computeStoreLastBlockRoipointPool3d(char *boxes3d,
int *cnt,
char *points_x,
char *points_y,
char *points_z,
const char *point_features,
char *auxiliary_a,
char *auxiliary_b,
char *auxiliary_c,
char *auxiliary_d,
char *auxiliary_e,
char *auxiliary_f,
const int box_idx,
const int pts_num,
const int feature_in_len,
const int sampled_pts_num,
const size_t span_num_deal,
const size_t auxiliary_num_deal,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
char *pts_assign = auxiliary_a;
if (cnt[box_idx] >= sampled_pts_num) {
// pooled_empty_flag_gdram set 0
*((int *)auxiliary_a) = 0;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
return;
}
checkPointsInBox3d((T *)(boxes3d + box_idx * 7 * sizeof(T)), span_num_deal, (T *)points_x,
(T *)points_y, (T *)points_z, (T *)auxiliary_a, (T *)auxiliary_b,
(T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_f,
(T *)pts_assign);
// __bang_select returns selected elements vector and the number of selected elements
__bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
uint32_t select_num = *((uint32_t *)auxiliary_b);
if (cnt[box_idx] + select_num == 0) {
// pooled_empty_flag_gdram set 1
*((int *)auxiliary_a) = 1;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
// pooled_features_gdram set 0
int repeat = (sampled_pts_num * (3 + feature_in_len)) / (auxiliary_num_deal * 6);
int rem = (sampled_pts_num * (3 + feature_in_len)) % (auxiliary_num_deal * 6);
// use auxiliary_a to auxiliary_f
__bang_write_zero((T *)auxiliary_a, PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE));
if (repeat > 0) {
__memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
auxiliary_a, auxiliary_num_deal * 6 * sizeof(T), NRAM2GDRAM,
auxiliary_num_deal * 6 * sizeof(T), 0, repeat - 1);
}
if (rem > 0) {
__memcpy(pooled_features_gdram +
box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T) +
repeat * auxiliary_num_deal * 6 * sizeof(T),
auxiliary_a, rem * sizeof(T), NRAM2GDRAM);
}
return;
}
if (select_num > 0) {
int sampled_pts_num_rem = sampled_pts_num - cnt[box_idx];
int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
// copy x to pooled_features_gdram
// The result of __bang_select is composed of three parts:
    // the first 4 bytes hold the number of selected elements (unsigned int);
    // the next 124 bytes are zero; the remaining bytes are the selected elements.
int select_num_size = 128;
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T),
(T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
1 * sizeof(T),
(T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
2 * sizeof(T),
(T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy features to pooled_features_gdram
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
(3 + c_idx) * sizeof(T),
auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
}
}
// pooled_empty_flag_gdram set 0
*((int *)auxiliary_a) = 0;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
cnt[box_idx] += select_num;
if (cnt[box_idx] < sampled_pts_num) {
// duplicate same points for sampling
int repeat = sampled_pts_num / cnt[box_idx] - 1;
int rem = sampled_pts_num % cnt[box_idx];
if (repeat > 0) {
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T),
pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
cnt[box_idx] * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM,
cnt[box_idx] * (3 + feature_in_len) * sizeof(T), 0, repeat - 1);
}
if (rem > 0) {
__memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + (repeat + 1) * cnt[box_idx]) *
(3 + feature_in_len) * sizeof(T),
pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
rem * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM);
}
}
}
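// The tail handling above tops up a partially filled box by recycling rows
// already written. An equivalent sketch (hypothetical helper, not in the
// original source), with row_len = 3 + feature_in_len:
template <typename T>
static void padByRepeat(T *rows, int cnt, int sampled_pts_num, int row_len) {
  for (int i = cnt; i < sampled_pts_num; ++i) {
    for (int k = 0; k < row_len; ++k) {
      rows[i * row_len + k] = rows[(i % cnt) * row_len + k];  // cycle rows
    }
  }
}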
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiPointPool3dForward(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
if (coreId == 0x80) {
return;
}
size_t boxes_per_core = (batch_size * boxes_num) / taskDim;
size_t boxes_rem = (batch_size * boxes_num) % taskDim;
// calc batch_start, batch_end, first_batch_box_start, last_batch_box_end for each core
int32_t batch_start = taskId < (boxes_rem + 1) ?
(taskId * (boxes_per_core + 1)) / boxes_num :
(taskId * boxes_per_core + boxes_rem) / boxes_num;
int32_t batch_end = taskId < boxes_rem ?
((taskId + 1) * (boxes_per_core + 1) - 1) / boxes_num :
((taskId + 1) * boxes_per_core + boxes_rem - 1) / boxes_num;
size_t first_batch_box_start = taskId < (boxes_rem + 1) ?
(taskId * (boxes_per_core + 1)) - batch_start * boxes_num :
taskId * boxes_per_core + boxes_rem - batch_start * boxes_num;
size_t last_batch_box_end = taskId < boxes_rem ?
(taskId + 1) * (boxes_per_core + 1) - batch_end * boxes_num :
((taskId + 1) * boxes_per_core + boxes_rem) - batch_end * boxes_num;
// points_xyz : [3, B, N]
const char *points_x_gdram = points_xyz_gdram;
const char *points_y_gdram = points_xyz_gdram + (1 * batch_size * pts_num) * sizeof(T);
const char *points_z_gdram = points_xyz_gdram + (2 * batch_size * pts_num) * sizeof(T);
size_t boxes3d_size = PAD_UP(boxes_num * 7, NFU_ALIGN_SIZE) * sizeof(T);
size_t cnt_size = PAD_UP(boxes_num, NFU_ALIGN_SIZE) * sizeof(int);
size_t span_num_deal = PAD_DOWN(
(MAX_NRAM_SIZE - boxes3d_size - cnt_size) / TWELVE_SPLIT / sizeof(T), NFU_ALIGN_SIZE);
size_t align_num = NFU_ALIGN_SIZE;
int32_t repeat = pts_num / span_num_deal;
size_t rem = pts_num % span_num_deal;
size_t align_rem = CEIL_ALIGN(rem, align_num);
char *boxes3d = nram_buffer;
char *cnt = nram_buffer + boxes3d_size;
char *ping_points_x = cnt + cnt_size;
char *ping_points_y = ping_points_x + span_num_deal * sizeof(T);
char *ping_points_z = ping_points_y + span_num_deal * sizeof(T);
size_t ping_pong_gap = 3 * span_num_deal * sizeof(T);
char *auxiliary_a = ping_points_x + 2 * ping_pong_gap;
char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T);
char *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T);
char *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T);
char *auxiliary_e = auxiliary_d + span_num_deal * sizeof(T);
char *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T);
size_t span_load_input1_size = span_num_deal * sizeof(T);
size_t span_load_input2_size = span_num_deal * sizeof(T);
size_t span_load_input3_size = span_num_deal * sizeof(T);
size_t span_load_input4_size = span_num_deal * sizeof(T);
for (int bs_idx = batch_start; bs_idx <= batch_end; bs_idx++) {
__memcpy_async(boxes3d, boxes3d_gdram + bs_idx * boxes_num * 7 * sizeof(T),
boxes_num * 7 * sizeof(T), GDRAM2NRAM);
__bang_write_zero((int *)cnt, PAD_UP(boxes_num, NFU_ALIGN_SIZE));
const char *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T);
const char *points_y_start = points_y_gdram + bs_idx * pts_num * sizeof(T);
const char *points_z_start = points_z_gdram + bs_idx * pts_num * sizeof(T);
const char *point_features_start =
point_features_gdram + bs_idx * feature_in_len * pts_num * sizeof(T);
char *pooled_features_start =
pooled_features_gdram +
(bs_idx * boxes_num * sampled_pts_num * (3 + feature_in_len)) * sizeof(T);
char *pooled_empty_flag_start = pooled_empty_flag_gdram + bs_idx * boxes_num * sizeof(int);
size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0;
size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num;
if (repeat > 0) {
__memcpy_async(ping_points_x, points_x_start, span_load_input1_size, GDRAM2NRAM);
__memcpy_async(ping_points_y, points_y_start, span_load_input2_size, GDRAM2NRAM);
__memcpy_async(ping_points_z, points_z_start, span_load_input3_size, GDRAM2NRAM);
__asm__ volatile("sync;");
}
for (int i = 0; i < repeat - 1; i++) {
__memcpy_async(ping_points_x + ((i + 1) % 2) * ping_pong_gap,
points_x_start + (i + 1) * span_load_input1_size, span_load_input1_size,
GDRAM2NRAM);
__memcpy_async(ping_points_y + ((i + 1) % 2) * ping_pong_gap,
points_y_start + (i + 1) * span_load_input2_size, span_load_input2_size,
GDRAM2NRAM);
__memcpy_async(ping_points_z + ((i + 1) % 2) * ping_pong_gap,
points_z_start + (i + 1) * span_load_input3_size, span_load_input3_size,
GDRAM2NRAM);
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
computeStoreRoipointPool3d<T>(
boxes3d, (int *)cnt, ping_points_x + (i % 2) * ping_pong_gap,
ping_points_y + (i % 2) * ping_pong_gap, ping_points_z + (i % 2) * ping_pong_gap,
point_features_start + i * span_load_input4_size, auxiliary_a, auxiliary_b, auxiliary_c,
auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
}
__asm__ volatile("sync;");
}
if (rem > 0) {
      // pad the tail of the remainder with NAN so the padded lanes never
      // pass the in-box test (float and half take the same path)
      __bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap +
                               PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
                         NFU_ALIGN_SIZE, (T)NAN);
      __bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap +
                               PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
                         NFU_ALIGN_SIZE, (T)NAN);
      __bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap +
                               PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
                         NFU_ALIGN_SIZE, (T)NAN);
__memcpy_async(ping_points_x + (repeat % 2) * ping_pong_gap,
points_x_start + repeat * span_load_input1_size, rem * sizeof(T), GDRAM2NRAM);
__memcpy_async(ping_points_y + (repeat % 2) * ping_pong_gap,
points_y_start + repeat * span_load_input2_size, rem * sizeof(T), GDRAM2NRAM);
__memcpy_async(ping_points_z + (repeat % 2) * ping_pong_gap,
points_z_start + repeat * span_load_input3_size, rem * sizeof(T), GDRAM2NRAM);
}
if (repeat > 0 && rem > 0) {
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
computeStoreRoipointPool3d<T>(
boxes3d, (int *)cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
}
} else if (repeat > 0 && rem == 0) {
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
computeStoreLastBlockRoipointPool3d<T>(
boxes3d, (int *)cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, span_num_deal, pooled_features_start,
pooled_empty_flag_start);
}
}
if (rem > 0) {
__asm__ volatile("sync;");
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
computeStoreLastBlockRoipointPool3d<T>(
boxes3d, (int *)cnt, ping_points_x + (repeat % 2) * ping_pong_gap,
ping_points_y + (repeat % 2) * ping_pong_gap,
ping_points_z + (repeat % 2) * ping_pong_gap,
point_features_start + repeat * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, align_rem, span_num_deal, pooled_features_start,
pooled_empty_flag_start);
}
}
}
}
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dForward<float>(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram);
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dForward<half>(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram);
void KernelRoiPointPool3dForward(cnrtDim3_t k_dim,
cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const void *points_xyz,
const void *boxes3d,
const void *point_features,
void *pooled_features,
int *pooled_empty_flag) {
switch (d_type) {
default: { break; }
case CNRT_FLOAT32: {
MLUUnion1KernelRoiPointPool3dForward<float><<<k_dim, k_type, queue>>>(
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
(char *)points_xyz, (char *)point_features, (char *)boxes3d,
(char *)pooled_features, (char *)pooled_empty_flag);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelRoiPointPool3dForward<half><<<k_dim, k_type, queue>>>(
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
(char *)points_xyz, (char *)point_features, (char *)boxes3d,
(char *)pooled_features, (char *)pooled_empty_flag);
}; break;
}
}
mmcv/ops/csrc/common/mlu/three_nn_mlu_kernel.mlu
deleted 100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include <algorithm>
__nram__ char nram_buffer[MAX_NRAM_SIZE];
#if __BANG_ARCH__ >= 322
/**
 * Returns the index of the minimum, which is packed after the value in
 * `ret`; for use on the result of __bang_argmin.
*/
__mlu_func__ uint32_t getIndice(half *ret) {
uint32_t indice = *((uint32_t *)((uint16_t *)ret + 1));
return indice;
}
/**
 * Returns the index of the minimum, which is packed after the value in
 * `ret`; for use on the result of __bang_argmin.
*/
__mlu_func__ uint32_t getIndice(float *ret) {
uint32_t indice = ((uint32_t *)ret)[1];
return indice;
}
#endif
template <typename T>
__mlu_func__ void auxArgmin(T *nram_dst, T *nram_src, const int num_deal,
T *value, int *index) {
__bang_min(nram_dst, nram_src, num_deal);
*value = nram_dst[0];
__bang_write_value(nram_dst, num_deal, *value);
__bang_eq(nram_dst, nram_src, nram_dst, num_deal);
__bang_findfirst1((uint32_t *)nram_dst, nram_dst, num_deal);
*index = *((int *)nram_dst);
}
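// Scalar equivalent of the emulation above (illustrative sketch): take the
// minimum, then the first index at which it occurs.
template <typename T>
static int argminRef(const T *v, int n, T *min_out) {
  int idx = 0;
  for (int i = 1; i < n; ++i) {
    if (v[i] < v[idx]) idx = i;  // strict '<' keeps the first occurrence
  }
  *min_out = v[idx];
  return idx;
}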
template <typename T>
__mlu_func__ void auxFuncFind3Min(T *nram_aux_a, const int auxa_offset,
int *nram_aux_b, const int auxb_offset,
T *nram_dest, T *nram_aux_sort_a,
int *nram_aux_sort_b, const int deal_offset) {
__bang_write_value(nram_aux_sort_a, auxa_offset, (T)(INFINITY));
__bang_write_value(nram_aux_sort_b, auxb_offset, (int)0);
int index = 0;
for (int i = 0; i < 3; i++) {
#if __BANG_ARCH__ >= 322
__bang_argmin(nram_dest, nram_aux_a, auxa_offset);
nram_aux_sort_a[i] = nram_dest[0];
index = getIndice(nram_dest);
#else
T value = 0;
auxArgmin(nram_dest, nram_aux_a, auxa_offset, &value, &index);
nram_aux_sort_a[i] = value;
#endif
nram_aux_sort_b[i] = nram_aux_b[index];
__memset_nram(nram_aux_a + index, 1, (T)(INFINITY));
}
__memcpy((char *)nram_aux_a, (char *)nram_aux_sort_a, auxa_offset * sizeof(T),
NRAM2NRAM);
__memcpy((char *)nram_aux_b, (char *)nram_aux_sort_b,
auxb_offset * sizeof(int), NRAM2NRAM);
}
template <typename T>
__mlu_func__ void auxFuncSort(T *nram_aux_a, const int auxa_offset,
int *nram_aux_b, const int auxb_offset,
T *nram_dest, T *nram_help_value,
int *nram_help_idx, const int num_deal,
const int deal_offset) {
for (int k = 0; k < num_deal; ++k) {
auxFuncFind3Min(nram_aux_a + k * auxa_offset, auxa_offset,
nram_aux_b + k * auxb_offset, auxb_offset, nram_dest,
nram_help_value, nram_help_idx, deal_offset);
}
}
template <typename T>
__mlu_func__ void auxFuncNN(
size_t *output_aux_sort_a_gap, size_t *output_aux_sort_b_gap,
size_t *output_aux_dest_gap, size_t *output_unknown_gap,
size_t *output_known_gap, size_t *output_dist_gap, size_t *auxillary_a_gap,
size_t *auxillary_b_gap, size_t *known_num_deal, size_t *unknown_num_deal,
size_t *align_num, size_t *auxa_offset, size_t *auxb_offset) {
/*
* nram partition:
* |-NFU_ALIGN_SIZE-|-2*NFU_ALIGN_SIZE-|-X*3*sizeof(T)-|
* space: | aux_sort_a | aux_sort_b | nram_unknown |
*
* | ------ (Y * 7 *sizeof(T)) ---------------- |
* | nram_known | nram_dist | nram_dest |
*
* | -X * NFU_ALIGN_SIZE ---|---X * 2 * NFU_ALIGN_SIZE-|
* | output_dist(aux_a) | output_dist(aux_b) |
* 200 series
* X = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (2/3) / (3 * sizeof(T) + 3 *
* NFU_ALIGN_SIZE)
* Y = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (1/3) / (7 * sizeof(T))
* 300 series
* X = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (4/5) / (3 *
* sizeof(T) + 3 * NFU_ALIGN_SIZE)
* Y = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) *
* (1/5) / (7 * sizeof(T))
*
*/
*align_num = NFU_ALIGN_SIZE / sizeof(T);
*auxa_offset = NFU_ALIGN_SIZE / sizeof(T);
*auxb_offset = 2 * NFU_ALIGN_SIZE / sizeof(int);
#if __BANG_ARCH__ >= 322
*known_num_deal = PAD_DOWN(
(MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 5 / (7 * sizeof(T)), *align_num);
*unknown_num_deal = PAD_DOWN((MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 5 * 4 /
(3 * sizeof(T) + 3 * NFU_ALIGN_SIZE),
*align_num);
#else
*known_num_deal = PAD_DOWN(
(MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 3 / (7 * sizeof(T)), *align_num);
*unknown_num_deal = PAD_DOWN((MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 3 * 2 /
(3 * sizeof(T) + 3 * NFU_ALIGN_SIZE),
*align_num);
#endif
*output_aux_sort_a_gap = 0;
*output_aux_sort_b_gap = *output_aux_sort_a_gap + NFU_ALIGN_SIZE;
*output_aux_dest_gap = *output_aux_sort_b_gap + 2 * NFU_ALIGN_SIZE;
*output_unknown_gap = *output_aux_dest_gap + *known_num_deal * sizeof(T);
*output_known_gap = *output_unknown_gap + *unknown_num_deal * 3 * sizeof(T);
*output_dist_gap = *output_known_gap + *known_num_deal * 3 * sizeof(T);
*auxillary_a_gap = *output_dist_gap + *known_num_deal * 3 * sizeof(T);
*auxillary_b_gap = *auxillary_a_gap + *unknown_num_deal * NFU_ALIGN_SIZE;
}
#if __BANG_ARCH__ >= 322
template <typename T>
__mlu_func__ bool containNanInf(T *nram_unknown) {
if (std::isnan(nram_unknown[0]) || std::isnan(nram_unknown[1]) ||
std::isnan(nram_unknown[2]) || std::isinf(nram_unknown[0]) ||
std::isinf(nram_unknown[1]) || std::isinf(nram_unknown[2]))
return true;
else
return false;
}
#endif
template <typename T>
__mlu_func__ void computeThreeNN(T *nram_unknown, T *nram_known, T *nram_dist,
T *nram_dest, T *nram_aux_a,
T *nram_aux_sort_a, int *nram_aux_b,
int *nram_aux_sort_b, const int known_num_deal,
const int known_seg_num, const int deal_offset,
const int known_count,
const int known_count_align) {
__bang_write_value(nram_dist, 3 * known_num_deal, (T)(INFINITY));
#if __BANG_ARCH__ >= 322
if (!containNanInf(nram_unknown)) {
#endif
// x1 - x2
__bang_sub_scalar(nram_dist, nram_known, nram_unknown[0],
known_count_align);
// y1 - y2
__bang_sub_scalar(nram_dist + known_count_align,
nram_known + known_count_align, nram_unknown[1],
known_count_align);
// z1 - z2
__bang_sub_scalar(nram_dist + 2 * known_count_align,
nram_known + 2 * known_count_align, nram_unknown[2],
known_count_align);
__bang_square(nram_dist, nram_dist, 3 * known_count_align);
__bang_add(nram_dist, nram_dist, nram_dist + known_count_align,
known_count_align);
__bang_add(nram_dist, nram_dist, nram_dist + 2 * known_count_align,
known_count_align);
#if __BANG_ARCH__ >= 322
}
#endif
int index = 0;
for (int i = 0; i < 3; i++) {
#if __BANG_ARCH__ >= 322
__bang_argmin(nram_dest, nram_dist, known_count_align);
nram_aux_a[i + deal_offset] = nram_dest[0];
index = getIndice(nram_dest);
#else
T value = 0;
auxArgmin(nram_dest, nram_dist, known_count_align, &value, &index);
nram_aux_a[i + deal_offset] = value;
#endif
nram_aux_b[i + deal_offset] = index + known_seg_num * known_num_deal;
__memset_nram(nram_dist + index, 1, (T)(INFINITY));
}
}
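// Reference semantics of the routine above (illustrative sketch; assumes a
// row-major m x 3 `known` layout for readability, whereas the kernel works
// on per-coordinate planes): squared L2 distance from the query to every
// known point, keeping the three smallest values and their indices.
static void threeNNRef(const float q[3], const float *known, int m,
                       float best_d[3], int best_i[3]) {
  best_d[0] = best_d[1] = best_d[2] = INFINITY;
  best_i[0] = best_i[1] = best_i[2] = 0;
  for (int j = 0; j < m; ++j) {
    float dx = known[3 * j] - q[0];
    float dy = known[3 * j + 1] - q[1];
    float dz = known[3 * j + 2] - q[2];
    float d = dx * dx + dy * dy + dz * dz;
    for (int k = 0; k < 3; ++k) {  // insert into the sorted top-3
      if (d < best_d[k]) {
        for (int t = 2; t > k; --t) {
          best_d[t] = best_d[t - 1];
          best_i[t] = best_i[t - 1];
        }
        best_d[k] = d;
        best_i[k] = j;
        break;
      }
    }
  }
}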
template <typename T>
__mlu_func__ void loadTransposedKnownTensor(
char *nram_known, char *nram_dist, const char *known_gdram,
const int known_num_deal, const int batch_id, const int m,
const int known_seg_num, const int count, const int count_align_num) {
__bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY));
#if __BANG_ARCH__ >= 322
__bang_write_value(nram_dist, 3 * known_num_deal, (T)(INFINITY));
__memcpy(nram_dist,
known_gdram +
(batch_id * m * 3 + known_seg_num * known_num_deal) * sizeof(T),
count * sizeof(T), GDRAM2NRAM, count_align_num * sizeof(T),
m * sizeof(T), 2);
__bang_minequal((T *)nram_known, (T *)nram_known, (T *)nram_dist,
3 * count_align_num);
#else
__memcpy(nram_known,
known_gdram +
(batch_id * m * 3 + known_seg_num * known_num_deal) * sizeof(T),
count * sizeof(T), GDRAM2NRAM, count_align_num * sizeof(T),
m * sizeof(T), 2);
#endif
}
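// Layout note (inferred from the strided copy above, so treat as an
// assumption): `known` is read as three coordinate planes of length m per
// batch, and each plane lands contiguously in NRAM at a stride of
// count_align_num elements.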
template <typename T>
__mlu_func__ void loadUnknownTensor(char *nram_unknown,
const char *unknown_gdram,
const int unknown_num_deal,
const int unknown_seg_num, const int count,
const int count_align_num) {
__memcpy(nram_unknown,
unknown_gdram + unknown_seg_num * unknown_num_deal * 3 * sizeof(T),
count * 3 * sizeof(T), GDRAM2NRAM);
}
template <typename T>
__mlu_func__ void auxProcessSegment(
const int m, const int n, T *nram_unknown, T *nram_known, T *nram_dist,
T *nram_dest, T *known_gdram, T *nram_aux_a, const int auxa_offset,
int *nram_aux_b, const int auxb_offset, T *nram_aux_sort_a,
int *nram_aux_sort_b, const int unknown_num_deal, const int known_num_deal,
const int known_seg_num, const int unknown_seg_num, const int unknown_count,
const int known_count, const int known_count_align, const int start_idx,
int *deal_offset) {
int pre_batch_id = -1;
int cur_batch_id = -1;
pre_batch_id = start_idx / n;
  // if the aux_a space is exhausted, keep only the 3 smallest entries in
  // aux_a and clear the rest.
if (*deal_offset >= PAD_DOWN(auxa_offset, 3)) {
auxFuncSort(nram_aux_a, auxa_offset, nram_aux_b, auxb_offset, nram_dest,
nram_aux_sort_a, nram_aux_sort_b, unknown_count, *deal_offset);
*deal_offset = 3;
}
// load i'th segment of known batch data.
loadTransposedKnownTensor<T>((char *)nram_known, (char *)nram_dist,
(char *)known_gdram, known_num_deal,
pre_batch_id, m, known_seg_num, known_count,
known_count_align);
for (int k = 0; k < unknown_count; ++k) {
cur_batch_id = (start_idx + k) / n;
if (cur_batch_id != pre_batch_id) { // if batch id of unknown data changed,
// load corresponding known batch data
pre_batch_id = cur_batch_id;
loadTransposedKnownTensor<T>((char *)nram_known, (char *)nram_dist,
(char *)known_gdram, known_num_deal,
pre_batch_id, m, known_seg_num, known_count,
known_count_align);
}
computeThreeNN(nram_unknown + 3 * k, nram_known, nram_dist, nram_dest,
nram_aux_a + k * auxa_offset, nram_aux_sort_a,
nram_aux_b + k * auxb_offset, nram_aux_sort_b,
known_num_deal, known_seg_num, *deal_offset, known_count,
known_count_align);
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelThreeNN(const int b, const int n,
const int m, char *unknown_gdram,
char *known_gdram, char *dist2_gdram,
int *idx_gdram) {
if (coreId == 0x80) {
return;
}
size_t output_aux_sort_a_gap = 0, output_aux_sort_b_gap = 0,
output_dest_gap = 0, output_unknown_gap = 0, output_known_gap = 0,
output_dist_gap = 0, auxillary_a_gap = 0, auxillary_b_gap = 0,
known_num_deal = 0, unknown_num_deal = 0, align_num = 0,
auxa_offset = 0, auxb_offset = 0;
auxFuncNN<T>(&output_aux_sort_a_gap, &output_aux_sort_b_gap, &output_dest_gap,
&output_unknown_gap, &output_known_gap, &output_dist_gap,
&auxillary_a_gap, &auxillary_b_gap, &known_num_deal,
&unknown_num_deal, &align_num, &auxa_offset, &auxb_offset);
int num_per_core = b * n / taskDim;
const int core_offset = num_per_core;
char *unknown_gdram_start =
unknown_gdram + taskId * 3 * core_offset * sizeof(T);
char *known_gdram_start = known_gdram;
char *output_dist_start = dist2_gdram + taskId * 3 * core_offset * sizeof(T);
int *output_idx_start = idx_gdram + taskId * 3 * core_offset;
const int rem = (b * n) % taskDim;
if (taskId == taskDim - 1) {
num_per_core += rem;
}
  const int unknown_repeat =
      num_per_core / unknown_num_deal;  // if the unknown count is large,
                                        // process it in unknown_repeat passes
  const int unknown_rem = num_per_core % unknown_num_deal;  // unknown remainder
  const int unknown_rem_align = PAD_UP(unknown_rem, align_num);
  const int known_repeat =
      m / known_num_deal;  // if the known count is large, process it in
                           // known_repeat passes
  const int known_rem = m % known_num_deal;  // known remainder
const int known_rem_align = PAD_UP(known_rem, align_num);
char *nram_aux_sort_a = nram_buffer;
int *nram_aux_sort_b = (int *)(nram_buffer + output_aux_sort_b_gap);
char *nram_dest = nram_buffer + output_dest_gap;
char *nram_unknown = nram_buffer + output_unknown_gap;
char *nram_known = nram_buffer + output_known_gap;
char *nram_dist = nram_buffer + output_dist_gap;
char *nram_aux_a = nram_buffer + auxillary_a_gap;
int *nram_aux_b = (int *)(nram_buffer + auxillary_b_gap);
int deal_offset = 0;
int start_idx = -1;
  for (int j = 0; j < unknown_repeat;
       ++j) {  // process one full unknown segment per pass
    // when the unknown data must be processed segmentally, use the aux_a and
    // aux_b buffers to track the first 3 minimum distances.
__bang_write_value(nram_aux_a, unknown_num_deal * auxa_offset,
(T)(INFINITY));
__bang_write_value(nram_aux_b, unknown_num_deal * auxb_offset, (int)0);
loadUnknownTensor<T>(nram_unknown, unknown_gdram_start, unknown_num_deal, j,
unknown_num_deal, unknown_num_deal);
deal_offset = 0;
start_idx = taskId * core_offset + j * unknown_num_deal;
    for (int i = 0; i < known_repeat;
         ++i) {  // process the known data segmentally
auxProcessSegment<T>(
m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
(T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
unknown_num_deal, known_num_deal, i, j, unknown_num_deal,
known_num_deal, known_num_deal, start_idx, &deal_offset);
deal_offset += 3;
}
if (known_rem > 0) { // process known rem
__bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY));
auxProcessSegment<T>(
m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
(T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
unknown_num_deal, known_num_deal, known_repeat, j, unknown_num_deal,
known_rem, known_rem_align, start_idx, &deal_offset);
}
deal_offset += 3;
if (deal_offset > 3) {
auxFuncSort((T *)nram_aux_a, auxa_offset, nram_aux_b, auxb_offset,
(T *)nram_dest, (T *)nram_aux_sort_a, nram_aux_sort_b,
unknown_num_deal, deal_offset);
deal_offset = 0;
}
__memcpy((char *)output_dist_start + j * unknown_num_deal * 3 * sizeof(T),
(char *)nram_aux_a, 3 * sizeof(T), NRAM2GDRAM, 3 * sizeof(T),
auxa_offset * sizeof(T), unknown_num_deal - 1);
__memcpy((char *)output_idx_start + j * unknown_num_deal * 3 * sizeof(int),
(char *)nram_aux_b, 3 * sizeof(int), NRAM2GDRAM, 3 * sizeof(int),
auxb_offset * sizeof(int), unknown_num_deal - 1);
}
if (unknown_rem > 0) { // process unknown rem
deal_offset = 0;
__bang_write_value(nram_aux_a, unknown_num_deal * auxa_offset,
(T)(INFINITY));
__bang_write_value(nram_aux_b, unknown_num_deal * auxb_offset, (int)0);
loadUnknownTensor<T>(nram_unknown, unknown_gdram_start, unknown_num_deal,
unknown_repeat, unknown_rem, unknown_rem_align);
start_idx = taskId * core_offset + unknown_repeat * unknown_num_deal;
for (int i = 0; i < known_repeat; ++i) {
auxProcessSegment<T>(
m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
(T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
unknown_num_deal, known_num_deal, i, unknown_repeat, unknown_rem,
known_num_deal, known_num_deal, start_idx, &deal_offset);
deal_offset += 3;
}
if (known_rem > 0) {
__bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY));
start_idx = taskId * core_offset + unknown_repeat * unknown_num_deal;
auxProcessSegment<T>(
m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
(T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
unknown_num_deal, known_num_deal, known_repeat, unknown_repeat,
unknown_rem, known_rem, known_rem_align, start_idx, &deal_offset);
deal_offset += 3;
}
if (deal_offset > 3) {
auxFuncSort((T *)nram_aux_a, auxa_offset, nram_aux_b, auxb_offset,
(T *)nram_dest, (T *)nram_aux_sort_a, nram_aux_sort_b,
unknown_rem, deal_offset);
deal_offset = 0;
}
__memcpy((char *)output_dist_start +
unknown_repeat * unknown_num_deal * 3 * sizeof(T),
(char *)nram_aux_a, 3 * sizeof(T), NRAM2GDRAM, 3 * sizeof(T),
auxa_offset * sizeof(T), unknown_rem - 1);
__memcpy((char *)output_idx_start +
unknown_repeat * unknown_num_deal * 3 * sizeof(int),
(char *)nram_aux_b, 3 * sizeof(int), NRAM2GDRAM, 3 * sizeof(int),
auxb_offset * sizeof(int), unknown_rem - 1);
}
}
template __mlu_global__ void MLUUnion1KernelThreeNN<float>(
const int b, const int n, const int m, char *unknown_gdram,
char *known_gdram, char *dist2_gdram, int *idx_gdram);
template __mlu_global__ void MLUUnion1KernelThreeNN<half>(
const int b, const int n, const int m, char *unknown_gdram,
char *known_gdram, char *dist2_gdram, int *idx_gdram);
void KernelThreeNNForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t data_type,
const void *unknown, const void *known, void *dist2,
int *idx, const int b, const int n, const int m) {
switch (data_type) {
case CNRT_FLOAT16: {
MLUUnion1KernelThreeNN<half><<<k_dim, k_type, queue>>>(
b, n, m, (char *)unknown, (char *)known, (char *)dist2, idx);
}; break;
case CNRT_FLOAT32: {
MLUUnion1KernelThreeNN<float><<<k_dim, k_type, queue>>>(
b, n, m, (char *)unknown, (char *)known, (char *)dist2, idx);
}; break;
default: {
break;
}
}
}
mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu
deleted 100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
__nram__ char data_nram[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void mluMultiKernelTinShift(
const T *input, const int *shifts, T *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel) {
for (int cur_channel_index = taskId;
cur_channel_index < batch_size * channel_size;
cur_channel_index += taskDim) {
int n_index = cur_channel_index / channel_size;
int group_id = cur_channel_index % channel_size / group_channel;
int t_shift = shifts[n_index * group_size + group_id];
int index = cur_channel_index % channel_size * hw_size +
n_index * time_size * channel_size * hw_size;
__bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;");
if (abs(t_shift) >= time_size) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
time_size - 1);
} else {
if (t_shift > 0) {
__memcpy(data_nram + t_shift * hw_size * sizeof(T), input + index,
hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T),
channel_size * hw_size * sizeof(T), time_size - 1 - t_shift);
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
time_size - 1);
} else {
__memcpy(data_nram, input + (index - t_shift * channel_size * hw_size),
hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T),
channel_size * hw_size * sizeof(T), time_size - 1 + t_shift);
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
time_size - 1);
}
}
__asm__ volatile("sync;");
}
}
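// Reference semantics of the kernel above (illustrative host-side sketch,
// not part of the original source): every channel group is shifted along
// the time axis by its per-batch offset, with zero padding at both ends.
static void tinShiftRef(const float *in, const int *shifts, float *out,
                        int batch, int t_size, int c_size, int hw,
                        int groups, int group_channel) {
  for (int n = 0; n < batch; ++n)
    for (int t = 0; t < t_size; ++t)
      for (int c = 0; c < c_size; ++c) {
        int src_t = t - shifts[n * groups + c / group_channel];
        for (int i = 0; i < hw; ++i) {
          long dst = ((long)(n * t_size + t) * c_size + c) * hw + i;
          long src = ((long)(n * t_size + src_t) * c_size + c) * hw + i;
          // reads that fall outside [0, t_size) are zero-padded
          out[dst] = (src_t >= 0 && src_t < t_size) ? in[src] : 0.f;
        }
      }
}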
template <typename T>
__mlu_func__ void mluHwSplit(const T *input, const int t_shift,
const int time_size, const int hw_size,
const int channel_size, const int index,
const int cur_sequence_index,
const int max_length_per_core, T *output) {
for (int cur_index = index; cur_index < index + hw_size;
cur_index += max_length_per_core) {
int memcpy_size = max_length_per_core;
if (cur_index + max_length_per_core > index + hw_size) {
memcpy_size = index + hw_size - cur_index;
}
if (cur_sequence_index - t_shift < 0 ||
cur_sequence_index - t_shift >= time_size) {
__memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T),
NRAM2GDRAM);
} else {
__memcpy(data_nram, input + cur_index - t_shift * channel_size * hw_size,
memcpy_size * sizeof(T), GDRAM2NRAM);
__memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T),
NRAM2GDRAM);
}
__asm__ volatile("sync;");
}
}
template <typename T>
__mlu_func__ void mluMultiKernelTinShiftSplitSequence(
const T *input, const int *shifts, T *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const int max_number_hw_per_core, const int max_length_per_core) {
const int tmp_max_number_hw_per_core =
max_number_hw_per_core > 0 ? max_number_hw_per_core : 1;
const int loop_time = time_size / tmp_max_number_hw_per_core +
((time_size % tmp_max_number_hw_per_core) > 0 ? 1 : 0);
int segmentime_size = tmp_max_number_hw_per_core;
int res_segment = time_size % tmp_max_number_hw_per_core;
for (int cur_segment_index = taskId;
cur_segment_index < loop_time * batch_size * channel_size;
cur_segment_index += taskDim) {
int n_index = cur_segment_index / loop_time / channel_size;
int group_id = cur_segment_index / loop_time % channel_size / group_channel;
int t_shift = shifts[n_index * group_size + group_id];
int index = n_index * time_size * channel_size * hw_size +
(cur_segment_index / loop_time % channel_size) * hw_size +
cur_segment_index % loop_time * segmentime_size * hw_size *
channel_size;
char *dst_gdram2nram = data_nram;
const T *src_gdram2nram = input + index;
int count_gdram2nram = -1;
int count_nram2gdram = -1;
int next_sequence_index =
index / hw_size / channel_size % time_size + segmentime_size;
int cur_sequence_index = index / hw_size / channel_size % time_size;
__bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;");
if (max_number_hw_per_core == 0) {
mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index,
cur_sequence_index, max_length_per_core, output);
continue;
}
if (abs(t_shift) >= time_size) {
if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
res_segment - 1);
} else {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
segmentime_size - 1);
}
continue;
}
if (t_shift == 0) {
if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
dst_gdram2nram = data_nram;
src_gdram2nram = input + index;
count_gdram2nram = res_segment - 1;
count_nram2gdram = res_segment - 1;
} else {
dst_gdram2nram = data_nram;
src_gdram2nram = input + index;
count_gdram2nram = segmentime_size - 1;
count_nram2gdram = segmentime_size - 1;
}
} else if (t_shift > 0) {
int first_index_cur_channel =
n_index * time_size * channel_size * hw_size +
(cur_segment_index / loop_time % channel_size) * hw_size;
if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
dst_gdram2nram = data_nram;
src_gdram2nram =
input +
(index - t_shift * channel_size * hw_size < first_index_cur_channel
? first_index_cur_channel
: index - t_shift * channel_size * hw_size);
count_gdram2nram = res_segment - 1;
count_nram2gdram = res_segment - 1;
if (cur_sequence_index < t_shift && t_shift < next_sequence_index) {
dst_gdram2nram =
data_nram + t_shift % segmentime_size * hw_size * sizeof(T);
count_gdram2nram = res_segment - (t_shift - cur_sequence_index) - 1;
}
} else {
if (t_shift >= next_sequence_index) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
segmentime_size - 1);
continue;
} else if (cur_sequence_index < t_shift &&
t_shift < next_sequence_index) {
dst_gdram2nram =
data_nram + t_shift % segmentime_size * hw_size * sizeof(T);
src_gdram2nram = input + first_index_cur_channel;
count_gdram2nram = segmentime_size - (t_shift % segmentime_size) - 1;
count_nram2gdram = segmentime_size - 1;
} else {
dst_gdram2nram = data_nram;
src_gdram2nram = input + index - t_shift * channel_size * hw_size;
count_gdram2nram = segmentime_size - 1;
count_nram2gdram = segmentime_size - 1;
}
}
} else {
int offset_index = time_size + t_shift;
if (cur_sequence_index >= offset_index) {
if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
res_segment - 1);
continue;
} else {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
segmentime_size - 1);
continue;
}
} else {
dst_gdram2nram = data_nram;
src_gdram2nram = input + index - t_shift * channel_size * hw_size;
if (cur_sequence_index - t_shift + segmentime_size < time_size) {
count_gdram2nram = segmentime_size - 1;
count_nram2gdram = segmentime_size - 1;
} else {
count_gdram2nram = time_size - (cur_sequence_index - t_shift) - 1;
count_nram2gdram =
(segmentime_size - 1) < (time_size - cur_sequence_index - 1)
? (segmentime_size - 1)
: (time_size - cur_sequence_index - 1);
}
}
}
__memcpy(dst_gdram2nram, src_gdram2nram, hw_size * sizeof(T), GDRAM2NRAM,
hw_size * sizeof(T), channel_size * hw_size * sizeof(T),
count_gdram2nram);
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
count_nram2gdram);
__asm__ volatile("sync;");
}
}
__mlu_entry__ void MLUUnion1KernelTinShift(
const void *input, const void *shifts, void *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const cnrtDataType_t data_dtype) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (data_dtype) {
case CNRT_FLOAT16: {
mluMultiKernelTinShift((half *)input, (const int *)shifts, (half *)output,
batch_size, time_size, channel_size, hw_size,
group_size, group_channel);
}; break;
case CNRT_FLOAT32: {
mluMultiKernelTinShift((float *)input, (const int *)shifts,
(float *)output, batch_size, time_size,
channel_size, hw_size, group_size, group_channel);
}; break;
default: { return; }
}
}
__mlu_entry__ void MLUUnion1KernelTinShiftSplitSequence(
const void *input, const void *shifts, void *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const int max_number_hw_per_core, const int max_length_per_core,
const cnrtDataType_t data_dtype) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (data_dtype) {
case CNRT_FLOAT16: {
mluMultiKernelTinShiftSplitSequence(
(half *)input, (const int *)shifts, (half *)output, batch_size,
time_size, channel_size, hw_size, group_size, group_channel,
max_number_hw_per_core, max_length_per_core);
}; break;
case CNRT_FLOAT32: {
mluMultiKernelTinShiftSplitSequence(
(float *)input, (const int *)shifts, (float *)output, batch_size,
time_size, channel_size, hw_size, group_size, group_channel,
max_number_hw_per_core, max_length_per_core);
}; break;
default: { return; }
}
}
void KernelTinShiftForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *input, const void *shifts, void *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const cnrtDataType_t data_dtype, const int channel_per_core,
const int max_number_hw_per_core, const int max_length_per_core) {
if (channel_per_core >= 1) {
MLUUnion1KernelTinShift<<<k_dim, k_type, queue>>>(
input, shifts, output, batch_size, time_size, channel_size, hw_size,
group_size, group_channel, data_dtype);
} else {
MLUUnion1KernelTinShiftSplitSequence<<<k_dim, k_type, queue>>>(
input, shifts, output, batch_size, time_size, channel_size, hw_size,
group_size, group_channel, max_number_hw_per_core, max_length_per_core,
data_dtype);
}
}
void KernelTinShiftBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *grad_output, const void *shifts, void *grad_input,
const int batch_size, const int time_size, const int channel_size,
const int hw_size, const int group_size, const int group_channel,
const cnrtDataType_t data_dtype, const int channel_per_core,
const int max_number_hw_per_core, const int max_length_per_core) {
if (channel_per_core >= 1) {
MLUUnion1KernelTinShift<<<k_dim, k_type, queue>>>(
grad_output, shifts, grad_input, batch_size, time_size, channel_size,
hw_size, group_size, group_channel, data_dtype);
} else {
MLUUnion1KernelTinShiftSplitSequence<<<k_dim, k_type, queue>>>(
grad_output, shifts, grad_input, batch_size, time_size, channel_size,
hw_size, group_size, group_channel, max_number_hw_per_core,
max_length_per_core, data_dtype);
}
}
mmcv/ops/csrc/common/pytorch_npu_helper.hpp
...
@@ -18,7 +18,7 @@
#ifndef PYTORCH_NPU_HELPER_HPP_
#define PYTORCH_NPU_HELPER_HPP_
#include <torch_npu/csrc/aten/NPUNativeFunctions.h>
#include <torch_npu/csrc/aten/CustomFunctions.h>
#include <torch_npu/csrc/framework/utils/CalcuOpUtil.h>
#include <torch_npu/csrc/framework/utils/OpAdapter.h>
...
@@ -27,9 +27,21 @@
#define NPU_NAME_SPACE at_npu::native
#ifdef MMCV_WITH_XLA
#define REGISTER_NPU_IMPL(key, value) REGISTER_DEVICE_IMPL(key, XLA, value)
#else
#define REGISTER_NPU_IMPL(key, value) \
REGISTER_DEVICE_IMPL(key, PrivateUse1, value)
#endif
#ifdef MMCV_WITH_XLA
#define CHECK_NPU(x) \
TORCH_CHECK(x.device().type() == at::kXLA, #x " must be a NPU tensor")
#else
#define CHECK_NPU(x) \
TORCH_CHECK(x.device().type() == at::kPrivateUse1, #x \
" must be a NPU " \
"tensor")
#endif
#endif // PYTORCH_NPU_HELPER_HPP_
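// Usage sketch (hypothetical op and function names, for illustration only):
// the macros above let a single registration line serve both the kXLA and
// the kPrivateUse1 builds.
//
//   void bbox_overlaps_npu(const Tensor bboxes1, const Tensor bboxes2,
//                          Tensor ious, int mode, bool aligned, int offset) {
//     CHECK_NPU(bboxes1);
//     // ... launch the NPU operator ...
//   }
//   REGISTER_NPU_IMPL(bbox_overlaps_impl, bbox_overlaps_npu);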
mmcv/ops/csrc/common/utils/spconv/tensorview/tensorview.h
View file @ 91da9643
...
@@ -319,8 +319,9 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
   TV_HOST_DEVICE_INLINE
   ShapeBase(std::initializer_list<int> shape)
       : SimpleVector<int, MaxDim>(shape) {}
-  template <typename scalar_t, template <class...> class Container>
-  ShapeBase(Container<scalar_t> shape) : SimpleVector<int, MaxDim>(shape) {}
+  // TODO: find out why this template cannot be used on windows
+  // template <typename scalar_t, template <class...> class Container>
+  // ShapeBase(Container<scalar_t> shape) : SimpleVector<int, MaxDim>(shape) {}
   TV_HOST_DEVICE_INLINE
   ShapeBase(const ShapeBase<MaxDim> &shape)
       : SimpleVector<int, MaxDim>(shape) {}
   ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}
...
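A small usage sketch of the constructors in this hunk; tv::Shape is assumed to be the usual ShapeBase<TV_MAX_DIM> alias from this header, and the values are illustrative:

#include <vector>

void shape_examples() {
  tv::Shape a{1, 3, 224, 224};           // std::initializer_list<int> ctor
  std::vector<int> v = {2, 64, 32, 32};
  tv::Shape b(v);                        // const std::vector<int> & ctor
  tv::Shape c(b);                        // ShapeBase copy ctor
}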
mmcv/ops/csrc/parrots/cudabind.cpp
View file @ 91da9643
...
@@ -564,57 +564,6 @@ REGISTER_DEVICE_IMPL(group_points_forward_impl, CUDA,
 REGISTER_DEVICE_IMPL(group_points_backward_impl, CUDA,
                      group_points_backward_cuda);

-void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
-                                                   const Tensor boxes_a,
-                                                   const int num_b,
-                                                   const Tensor boxes_b,
-                                                   Tensor ans_overlap);
-void IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes, Tensor &keep,
-                                         Tensor &keep_num,
-                                         float nms_overlap_thresh);
-void IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes,
-                                               Tensor &keep, Tensor &keep_num,
-                                               float nms_overlap_thresh);
-
-void iou3d_boxes_overlap_bev_forward_cuda(const int num_a,
-                                          const Tensor boxes_a,
-                                          const int num_b,
-                                          const Tensor boxes_b,
-                                          Tensor ans_overlap) {
-  IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
-                                                ans_overlap);
-};
-
-void iou3d_nms3d_forward_cuda(const Tensor boxes, Tensor &keep,
-                              Tensor &keep_num, float nms_overlap_thresh) {
-  IoU3DNMS3DForwardCUDAKernelLauncher(boxes, keep, keep_num,
-                                      nms_overlap_thresh);
-};
-
-void iou3d_nms3d_normal_forward_cuda(const Tensor boxes, Tensor &keep,
-                                     Tensor &keep_num,
-                                     float nms_overlap_thresh) {
-  IoU3DNMS3DNormalForwardCUDAKernelLauncher(boxes, keep, keep_num,
-                                            nms_overlap_thresh);
-};
-
-void iou3d_boxes_overlap_bev_forward_impl(const int num_a,
-                                          const Tensor boxes_a,
-                                          const int num_b,
-                                          const Tensor boxes_b,
-                                          Tensor ans_overlap);
-void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,
-                              Tensor &keep_num, float nms_overlap_thresh);
-void iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep,
-                                     Tensor &keep_num,
-                                     float nms_overlap_thresh);
-
-REGISTER_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, CUDA,
-                     iou3d_boxes_overlap_bev_forward_cuda);
-REGISTER_DEVICE_IMPL(iou3d_nms3d_forward_impl, CUDA, iou3d_nms3d_forward_cuda);
-REGISTER_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, CUDA,
-                     iou3d_nms3d_normal_forward_cuda);

 void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
                                   const Tensor xyz, const Tensor new_xyz,
                                   Tensor idx, Tensor dist2);
...
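For context on the pattern deleted above: each *_cuda wrapper forwards to a kernel launcher, and REGISTER_DEVICE_IMPL binds it to the device-agnostic *_impl symbol. A rough sketch of the dispatching side (simplified; the real machinery lives in mmcv's pytorch_device_registry.hpp):

// Sketch: how a device-agnostic entry point typically dispatches. The macro
// looks up the function registered for the tensors' device type -- with the
// registrations above, CUDA tensors reach iou3d_nms3d_forward_cuda.
void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,
                              Tensor &keep_num, float nms_overlap_thresh) {
  DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, keep, keep_num,
                       nms_overlap_thresh);
}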
mmcv/ops/csrc/pytorch/bbox_overlaps.cpp
View file @ 91da9643

 // Copyright (c) OpenMMLab. All rights reserved
 #include "pytorch_cpp_helper.hpp"
 #include "pytorch_device_registry.hpp"

+#ifdef MMCV_WITH_DIOPI
+#include <diopi/diopirt.h>
+#include <diopi/functions.h>
+#include <diopi/functions_mmcv.h>
+
+#include "csrc_dipu/base/basedef.h"
+#include "csrc_dipu/diopirt/diopirt_impl.h"
+
+using dipu::diopi_helper::toDiopiScalar;
+using dipu::diopi_helper::toDiopiTensorHandle;
+#endif

 void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                         const int mode, const bool aligned, const int offset) {
...
@@ -8,7 +19,42 @@ void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        aligned, offset);
 }

+#ifdef MMCV_WITH_DIOPI
+void bbox_overlaps_diopi(const Tensor bboxes1, const Tensor bboxes2,
+                         Tensor ious, const int mode, const bool aligned,
+                         const int offset) {
+  auto bboxes1_p = toDiopiTensorHandle(bboxes1);
+  diopiDevice_t device;
+  diopiGetTensorDevice(bboxes1_p, &device);
+  if (device == diopi_host) {
+    bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
+    return;
+  }
+  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());
+  diopiContextHandle_t ch = &ctx;
+  auto bboxes2_p = toDiopiTensorHandle(bboxes2);
+  auto ious_p = toDiopiTensorHandle(ious);
+  bool is_mock_cuda = bboxes1.device().type() == dipu::DIPU_DEVICE_TYPE;
+  if (is_mock_cuda &&
+      reinterpret_cast<void *>(diopiBboxOverlapsMmcv) != nullptr) {
+    auto ret = diopiBboxOverlapsMmcv(ch, ious_p, bboxes1_p, bboxes2_p, mode,
+                                     offset, aligned);
+    if (ret == diopiSuccess) return;
+  }
+  LOG(WARNING) << "Fallback to cpu: mmcv ext op bbox_overlaps";
+  auto bboxes1_cpu = bboxes1.cpu();
+  auto bboxes2_cpu = bboxes2.cpu();
+  auto ious_cpu = ious.cpu();
+  bbox_overlaps_impl(bboxes1_cpu, bboxes2_cpu, ious_cpu, mode, aligned,
+                     offset);
+  ious.copy_(ious_cpu);
+}
+#endif

 void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                    const int mode, const bool aligned, const int offset) {
+#ifdef MMCV_WITH_DIOPI
+  bbox_overlaps_diopi(bboxes1, bboxes2, ious, mode, aligned, offset);
+#else
   bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
+#endif
 }
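A hypothetical C++ call site for the dispatcher defined above; the (x1, y1, x2, y2) box layout, mode 0 = IoU / 1 = IoF, and the aligned output shape follow mmcv's documented convention, while the values and function name are invented for this example:

#include <torch/torch.h>

void bbox_overlaps_example() {
  // Two aligned sets of 8 boxes in (x1, y1, x2, y2) form.
  torch::Tensor b1 = torch::tensor({0., 0., 10., 10.}).repeat({8, 1});
  torch::Tensor b2 = torch::tensor({5., 5., 15., 15.}).repeat({8, 1});
  torch::Tensor ious = torch::zeros({8});  // aligned=true -> one IoU per row
  bbox_overlaps(b1, b2, ious, /*mode=*/0, /*aligned=*/true, /*offset=*/0);
}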
mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu
View file @ 91da9643
...
@@ -289,12 +289,13 @@ torch::Tensor bias_act_op(const torch::Tensor &x, const torch::Tensor &b,
   int blockSize = 4 * 32;
   int gridSize = (p.sizeX - 1) / (p.loopX * blockSize) + 1;
   void *args[] = {&p};

-#ifndef MMCV_WITH_HIP
-  AT_CUDA_CHECK(cudaLaunchKernel(kernel, gridSize, blockSize, args, 0,
-                                 at::cuda::getCurrentCUDAStream()));
-#else
+#ifdef MMCV_WITH_HIP
   AT_CUDA_CHECK(hipLaunchKernel(kernel, gridSize, blockSize, args, 0,
                                 at::cuda::getCurrentCUDAStream()));
+#else
+  AT_CUDA_CHECK(cudaLaunchKernel(kernel, gridSize, blockSize, args, 0,
+                                 at::cuda::getCurrentCUDAStream()));
 #endif

   return y;
 }
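The gridSize expression above is a ceiling division: with blockSize = 128 threads per block and loopX elements handled per thread, it launches just enough blocks to cover sizeX elements. An equivalent helper, written out (illustrative, not from this file):

// ceil(size_x / (loop_x * block_size)) in integer arithmetic; matches
// (p.sizeX - 1) / (p.loopX * blockSize) + 1 above for size_x >= 1.
inline int grid_size(int size_x, int loop_x, int block_size) {
  return (size_x - 1) / (loop_x * block_size) + 1;
}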