OpenDAS / MMCV · Commits

Commit 6f3c5f1c, authored Jul 11, 2024 by limm
support v1.4.0
Parent: 6f674c7e
Changes: 339

Showing 20 changed files with 4 additions and 6483 deletions (+4 −6483)
mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu                              +0 −483
mmcv/ops/csrc/common/mlu/nms_utils.hpp                                   +0 −553
mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu                          +0 −615
mmcv/ops/csrc/common/mlu/psamask_utils.hpp                               +0 −55
mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu                        +0 −493
mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu                +0 −490
mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp                     +0 −24
mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu                         +0 −747
mmcv/ops/csrc/common/mlu/roiaware_pool3d_mlu_kernel.mlu                  +0 −747
mmcv/ops/csrc/common/mlu/roipoint_pool3d_large_boxes_num_mlu_kernel.mlu  +0 −536
mmcv/ops/csrc/common/mlu/roipoint_pool3d_mlu_kernel.mlu                  +0 −544
mmcv/ops/csrc/common/mlu/three_nn_mlu_kernel.mlu                         +0 −466
mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu                        +0 −307
mmcv/ops/csrc/common/mps/MPSDevice.h                                     +0 −64
mmcv/ops/csrc/common/mps/MPSLibrary.h                                    +0 −61
mmcv/ops/csrc/common/mps/MPSLibrary.mm                                   +0 −107
mmcv/ops/csrc/common/mps/MPSStream.h                                     +0 −132
mmcv/ops/csrc/common/mps/MPSUtils.h                                      +0 −51
mmcv/ops/csrc/common/pytorch_cpp_helper.hpp                              +4 −7
mmcv/ops/csrc/common/pytorch_cuda_helper.hpp                             +0 −1

Too many changes to show. To preserve performance only 339 of 339+ files are displayed.
mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "nms_utils.hpp"
#define COORD_DIM (4)
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
__nram__ int8_t nram_buffer[SIZE_NRAM_BUF];
__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF];
enum Addr { SRAM, GDRAM };
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection(
uint32_t &output_box_num, const int output_mode, OUT_DT *output_dram,
IN_DT *input_data_score, const IN_DT *input_data_box, const Addr input_ram,
IN_DT *sram, const int core_limit, const int input_num_boxes,
const int max_output_size, const float thresh_iou, const float thresh_score,
const float offset, const int algo) {
// global value
int32_t *exit_flag = (int32_t *)(sram + 28);
exit_flag[0] = 0;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
int nms_buffer_count1 = 9;
// temp nram buffer to store selected target.
int nram_save_limit_count = 256;
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr
const IN_DT *input_x1_ptr = input_data_box;
const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;
int limit = 0; // find limit when GDRAM or SRAM
int max_seg_pad = 0; // the max length every repeat
int repeat = 0;
int remain = 0;
int remain_pad = 0;
int input_offset = 0; // offset of input_data for current core
int nram_save_count = 0;
if (output_mode == 0) {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
} else {
// 5 means: score, x1, y1, x2, y2
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * 5 * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
}
int max_seg_iou_compute = 0;
int repeat_iou_compute = 0;
int remain_iou_compute = 0;
int remain_pad_iou_compute = 0;
getComputeParamsBlockOrU1(sizeof(IN_DT), input_num_boxes, limit, core_limit,
input_offset, max_seg_pad, repeat, remain,
remain_pad, max_seg_iou_compute, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute);
// init the data ptr
IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *x1 = score + max_seg_pad;
IN_DT *y1 = x1 + max_seg_pad;
IN_DT *x2 = y1 + max_seg_pad;
IN_DT *y2 = x2 + max_seg_pad;
IN_DT *inter_x1 = y2 + max_seg_pad;
IN_DT *inter_y1 = inter_x1 + max_seg_pad;
IN_DT *inter_x2 = inter_y1 + max_seg_pad;
IN_DT *inter_y2 = inter_x2 + max_seg_pad;
IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
NFU_ALIGN_SIZE); // offset two line from max_box
#if __BANG_ARCH__ >= 300
float max_box_x1 = 0;
float max_box_y1 = 0;
float max_box_x2 = 0;
float max_box_y2 = 0;
#endif
mluMemcpyDirection_t load_dir = SRAM2NRAM;
mluMemcpyDirection_t store_dir = NRAM2SRAM;
load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= 0
if (core_limit != 1) {
__sync_cluster(); // sync before current loop
}
/******FIND MAX START******/
int max_index = 0; // the max score index
int global_max_index = 0; // for U1
float max_area = 0; // the max score area
max_box[0] = 0; // init 0
findCoreMaxBox(input_data_score, score, inter_x1, max_box, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
input_offset, repeat, remain, remain_pad, max_seg_pad,
max_index);
if (core_limit == 1) {
#if __BANG_ARCH__ >= 300
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box_x2, max_box_y2);
#else
calMaxArea(max_box, algo, offset, max_area);
#endif
input_data_score[max_index] = 0;
global_max_index = max_index;
} else if (core_limit == 4) {
__sync_cluster();
findClusterMaxBox(sram, max_box, inter_x1, input_data_score, core_limit);
#if __BANG_ARCH__ >= 300
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box_x2, max_box_y2);
#else
calMaxArea(max_box, algo, offset, max_area);
#endif
global_max_index = ((uint32_t *)(max_box + 5))[0];
input_data_score[global_max_index] = 0;
}
// by now, we get: max_score|max_index|max_box|max_area
/******FIND MAX END******/
storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
max_output_size, thresh_score, output_mode, nram_save_count,
output_box_num);
// if the max score <= 0, end
if (core_limit == 1) {
if (float(max_box[0]) <= thresh_score) {
break;
}
} else {
if (float(max_box[0]) <= thresh_score) {
if (coreId == 0) {
exit_flag[0] = 1;
}
}
__sync_cluster();
if (exit_flag[0] == 1) {
break;
}
}
/******NMS STORE END******/
#if __BANG_ARCH__ >= 300
scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box_x1,
max_box_y1, max_box_x2, max_box_y2, nram_save,
repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
input_offset, offset, max_area, input_num_boxes, algo);
#else
scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box[1],
max_box[2], max_box[3], max_box[4], nram_save,
repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
input_offset, offset, max_area, input_num_boxes, algo);
#endif
} // for max_output_size
}
__mlu_global__ void MLUUnion1KernelNMS(
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int max_output_size,
const float iou_threshold, const float confidence_threshold,
const int output_mode, void *workspace, void *result_num, void *output,
const cnrtDataType_t data_type_input, const float offset, const int algo) {
if (data_type_input == CNRT_FLOAT16) {
__memcpy(workspace, input_confidence, input_num_boxes * sizeof(half),
GDRAM2GDRAM);
} else if (data_type_input == CNRT_FLOAT32) {
__memcpy(workspace, input_confidence, input_num_boxes * sizeof(float),
GDRAM2GDRAM);
} else {
}
uint32_t output_box_num = 0;
float *score_data = (float *)workspace;
float *boxes_data = (float *)input_boxes;
float *sram = (float *)sram_buffer;
if (output_mode == 0) {
if (data_type_input == CNRT_FLOAT32) {
nms_detection(output_box_num, output_mode, (uint32_t *)output, score_data,
boxes_data, GDRAM, sram, taskDim, input_num_boxes,
max_output_size, iou_threshold, confidence_threshold,
offset, algo);
} else {
nms_detection(output_box_num, output_mode, (uint32_t *)output,
(half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
taskDim, input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
}
} else {
if (data_type_input == CNRT_FLOAT32) {
nms_detection(output_box_num, output_mode, (float *)output, score_data,
boxes_data, GDRAM, sram, taskDim, input_num_boxes,
max_output_size, iou_threshold, confidence_threshold,
offset, algo);
} else {
nms_detection(output_box_num, output_mode, (half *)output,
(half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
taskDim, input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
}
}
((uint32_t *)result_num)[0] = output_box_num;
}
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection_ux(
int32_t *exit_flag, uint32_t &output_box_num, OUT_DT *output_dram,
IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram,
const int input_num_boxes, const int max_output_size,
const float thresh_iou, const float thresh_score, const float offset,
const int output_mode, const int algo, char *cdma_gdram) {
exit_flag[0] = 0;
IN_DT *sram = (IN_DT *)sram_buffer;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
int nms_buffer_count1 = 9;
// temp nram buffer to store selected target.
int nram_save_limit_count = 256;
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr
const IN_DT *input_x1_ptr = boxes_data;
const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;
int limit = 0; // find limit when GDRAM or SRAM
int max_seg_pad = 0; // the max length every repeat
int repeat = 0;
int remain = 0;
int remain_pad = 0;
int nram_save_count = 0;
if (output_mode == 0) {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
} else {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * INFO_NUM * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
}
int input_offset = 0;
int max_seg_iou_compute = 0;
int repeat_iou_compute = 0;
int remain_iou_compute = 0;
int remain_pad_iou_compute = 0;
getComputeParamsUx(sizeof(IN_DT), input_num_boxes, limit, input_offset,
max_seg_pad, repeat, remain, remain_pad,
max_seg_iou_compute, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute);
// init the nram ptr
IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *x1 = score + max_seg_pad;
IN_DT *y1 = x1 + max_seg_pad;
IN_DT *x2 = y1 + max_seg_pad;
IN_DT *y2 = x2 + max_seg_pad;
IN_DT *inter_x1 = y2 + max_seg_pad;
IN_DT *inter_y1 = inter_x1 + max_seg_pad;
IN_DT *inter_x2 = inter_y1 + max_seg_pad;
IN_DT *inter_y2 = inter_x2 + max_seg_pad;
IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
NFU_ALIGN_SIZE); // offset two line from max_box
#if __BANG_ARCH__ >= 300
float max_box_x1 = 0;
float max_box_y1 = 0;
float max_box_x2 = 0;
float max_box_y2 = 0;
#endif
mluMemcpyDirection_t load_dir = SRAM2NRAM;
mluMemcpyDirection_t store_dir = NRAM2SRAM;
load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= 0
__sync_all();
int max_index = 0;
int global_max_index = 0; // for Ux
float max_area = 0; // the max score area
max_box[0] = 0; // init 0
if (coreId == 0) {
findCoreMaxBox(score_data, score, inter_x1, max_box, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
input_offset, repeat, remain, remain_pad, max_seg_pad,
max_index);
// copy max box info to sram
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_all();
#if __BANG_ARCH__ >= 590
__memcpy((char *)cdma_gdram + REDUCE_NUM * clusterId * sizeof(IN_DT), sram,
REDUCE_NUM * sizeof(IN_DT), SRAM2GDRAM);
__sync_all();
if (clusterId == 0 && coreId == 0) {
__bang_write_zero(inter_x1, NMS_SIZE);
__memcpy((char *)inter_x1, (char *)cdma_gdram, sizeof(IN_DT), GDRAM2NRAM,
sizeof(IN_DT), REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
__bang_max(max_box, inter_x1, NMS_SIZE);
int max_cluster = (sizeof(IN_DT) == sizeof(half))
? ((uint16_t *)max_box)[1]
: ((uint32_t *)max_box)[1];
__memcpy((char *)cdma_gdram,
(char *)cdma_gdram + max_cluster * REDUCE_NUM * sizeof(IN_DT),
REDUCE_NUM * sizeof(IN_DT), GDRAM2GDRAM);
}
__sync_all();
__memcpy(max_box, cdma_gdram, REDUCE_NUM * sizeof(IN_DT), GDRAM2NRAM);
#else
findGlobalMaxBox(max_box, sram, inter_x1);
#endif
#if __BANG_ARCH__ >= 300
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box_x2, max_box_y2);
#else
calMaxArea(max_box, algo, offset, max_area);
#endif
global_max_index = ((uint32_t *)(max_box + 5))[0];
if (coreId != MEMORY_CORE) {
score_data[global_max_index] = 0;
}
storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
max_output_size, thresh_score, output_mode, nram_save_count,
output_box_num);
if (float(max_box[0]) <= thresh_score) {
if (clusterId == 0 && coreId == 0) {
exit_flag[0] = 1; // dram
}
}
__sync_all();
if (exit_flag[0] == 1) {
break;
}
/******NMS STORE END******/
#if __BANG_ARCH__ >= 300
scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
inter_y1, inter_x2, inter_y2, max_box, max_box_x1, max_box_y1,
max_box_x2, max_box_y2, nram_save, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
max_area, input_num_boxes, algo);
#else
scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
inter_y1, inter_x2, inter_y2, max_box, max_box[1], max_box[2],
max_box[3], max_box[4], nram_save, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
max_area, input_num_boxes, algo);
#endif
} // for max_output_size
}
__mlu_global__ void MLUUionXKernelNMS(
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int max_output_size,
const float iou_threshold, const float confidence_threshold,
const float offset, const cnrtDataType_t data_type_input,
const int output_mode, const int algo, void *workspace, void *result_num,
void *output) {
int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
int32_t *exit_flag = (int32_t *)((char *)workspace +
INFO_NUM * input_num_boxes * input_dwidth);
char *cdma_addr = (char *)exit_flag + sizeof(int32_t);
int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth;
int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size;
int cluster_score_size = input_num_boxes * input_dwidth;
int cluster_boxes_size = input_num_boxes * 4 * input_dwidth;
char *sram_score = (char *)sram_buffer + reduce_sram_size;
char *sram_boxes =
(char *)sram_buffer + reduce_sram_size + cluster_score_size;
Addr input_ram = GDRAM;
if ((cluster_score_size + cluster_boxes_size) < availbale_sram_size) {
input_ram = SRAM;
__memcpy(sram_score, input_confidence, cluster_score_size, GDRAM2SRAM);
__memcpy(sram_boxes, input_boxes, cluster_boxes_size, GDRAM2SRAM);
} else {
__memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM);
}
__sync_cluster();
uint32_t output_box_num = 0;
float *score_data;
float *boxes_data;
score_data = (input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data = (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
if (output_mode == 0) {
if (data_type_input == CNRT_FLOAT32) {
nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
score_data, boxes_data, input_ram, input_num_boxes,
max_output_size, iou_threshold, confidence_threshold,
offset, output_mode, algo, cdma_addr);
} else {
nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
(half *)score_data, (half *)boxes_data, input_ram,
input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo,
cdma_addr);
}
} else {
if (data_type_input == CNRT_FLOAT32) {
nms_detection_ux(exit_flag, output_box_num, (float *)output, score_data,
boxes_data, input_ram, input_num_boxes, max_output_size,
iou_threshold, confidence_threshold, offset, output_mode,
algo, cdma_addr);
} else {
nms_detection_ux(exit_flag, output_box_num, (half *)output,
(half *)score_data, (half *)boxes_data, input_ram,
input_num_boxes, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo,
cdma_addr);
}
}
((uint32_t *)result_num)[0] = output_box_num;
}
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_ptr,
const void *scores_ptr, const int input_num_boxes,
const int max_output_boxes, const float iou_threshold,
const float offset, void *workspace_ptr, void *output_size_ptr,
void *output_ptr) {
switch (k_type) {
default: { return; }
case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_UNION1: {
MLUUnion1KernelNMS<<<k_dim, k_type, queue>>>(
(void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0,
/*output_mode=*/0, workspace_ptr, output_size_ptr, output_ptr,
data_type_input, offset, /*algo=*/1);
}; break;
case CNRT_FUNC_TYPE_UNION2:
case CNRT_FUNC_TYPE_UNION4:
case CNRT_FUNC_TYPE_UNION8:
case CNRT_FUNC_TYPE_UNION16: {
MLUUionXKernelNMS<<<k_dim, k_type, queue>>>(
(void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, offset,
data_type_input, /*output_mode=*/0, /*algo=*/1, workspace_ptr,
output_size_ptr, output_ptr);
}; break;
}
}
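
For orientation, the rule implemented by both MLUUnion1KernelNMS and MLUUionXKernelNMS is plain greedy NMS: repeatedly pick the highest-scoring remaining box and zero the scores of all boxes whose IoU with it exceeds the threshold. The following host-side C++ sketch is illustrative only; it is not part of this commit, and the Box struct and function names are assumptions made for the example.

// Reference-only sketch of the greedy NMS rule the MLU kernels implement.
// Not part of this commit; names and the [x1, y1, x2, y2] layout are assumed.
#include <vector>

struct Box { float x1, y1, x2, y2, score; };

static float iou(const Box &a, const Box &b, float offset) {
  float ix1 = a.x1 > b.x1 ? a.x1 : b.x1, iy1 = a.y1 > b.y1 ? a.y1 : b.y1;
  float ix2 = a.x2 < b.x2 ? a.x2 : b.x2, iy2 = a.y2 < b.y2 ? a.y2 : b.y2;
  float iw = ix2 - ix1 + offset, ih = iy2 - iy1 + offset;
  if (iw <= 0 || ih <= 0) return 0.f;
  float inter = iw * ih;
  float area_a = (a.x2 - a.x1 + offset) * (a.y2 - a.y1 + offset);
  float area_b = (b.x2 - b.x1 + offset) * (b.y2 - b.y1 + offset);
  return inter / (area_a + area_b - inter);
}

std::vector<int> nms_reference(std::vector<Box> boxes, float iou_thresh,
                               float offset = 0.f) {
  std::vector<int> keep;
  for (;;) {
    // pick the highest remaining score (the kernels do this with __bang_max)
    int best = -1;
    for (size_t i = 0; i < boxes.size(); ++i)
      if (boxes[i].score > 0 &&
          (best < 0 || boxes[i].score > boxes[best].score))
        best = static_cast<int>(i);
    if (best < 0) break;
    keep.push_back(best);
    // suppress: zero the score of every box whose IoU with the winner exceeds
    // the threshold, mirroring how scoreUpdate() zeroes scores on NRAM
    for (auto &b : boxes)
      if (b.score > 0 && iou(boxes[best], b, offset) > iou_thresh) b.score = 0;
    boxes[best].score = 0;
  }
  return keep;
}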
mmcv/ops/csrc/common/mlu/nms_utils.hpp
deleted
100644 → 0
/*************************************************************************
* Copyright (C) [2019-2022] by Cambricon, Inc.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef NMS_UTILS_HPP_
#define NMS_UTILS_HPP_
#include "common_mlu_helper.hpp"
#define NMS_SIZE (64)
#define NMS_UP(x, y) (x / y + (int)(x % y > 0)) * y
#define NMS_DOWN(x, y) (x / y) * y
#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score
#define MEMORY_CORE (0x80)
#define REDUCE_NUM \
(7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
__mlu_func__ void pvLock() {
#if __BANG_ARCH__ == 270
  if (coreId != MEMORY_CORE) {
    __bang_lock(0, 0);
  }
#endif
}
__mlu_func__ void pvUnlock() {
#if __BANG_ARCH__ == 270
  if (coreId != MEMORY_CORE) {
    __bang_unlock(0, 0);
  }
#endif
}
template <typename T>
static __mlu_func__ void computeReluN(T *nram_dst, T *nram_src, void *nram_tmp,
                                      const int deal_num,
                                      const T threshold = 0) {
  if (threshold < 0) {
    return;
  }
  if (threshold) {
#if __BANG_ARCH__ >= 300
    __bang_relun(nram_dst, nram_src, deal_num, threshold);
#else
    int align_num = NFU_ALIGN_SIZE / sizeof(T);
    T *nram_aux_a = (T *)nram_tmp;
    T *nram_aux_b = nram_aux_a + deal_num;
    T *nram_zero = nram_aux_b + align_num;
    __bang_write_value(nram_aux_b, align_num, threshold);
    __bang_write_zero(nram_zero, align_num);
    __bang_cycle_lt((T *)nram_aux_a, nram_src, (T *)nram_aux_b, deal_num,
                    align_num);
    __bang_mul(nram_dst, nram_src, (T *)nram_aux_a, deal_num);
    __bang_cycle_eq((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_zero, deal_num,
                    align_num);
    __bang_cycle_mul((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_aux_b,
                     deal_num, align_num);
    __bang_add(nram_dst, nram_dst, (T *)nram_aux_a, deal_num);
    __bang_cycle_gt((T *)nram_aux_a, nram_dst, (T *)nram_zero, deal_num,
                    align_num);
    __bang_mul(nram_dst, nram_dst, (T *)nram_aux_a, deal_num);
#endif
  } else {
#if __BANG_ARCH__ >= 300
    __bang_relu(nram_dst, nram_src, deal_num);
#else
    __bang_active_relu(nram_dst, nram_src, deal_num);
#endif
  }
}
__mlu_func__ void getComputeParamsBlockOrU1(
    const int input_dwidth, const int input_box_num, const int limit,
    const int core_limit, int &input_offset, int &max_seg_pad, int &repeat,
    int &remain, int &remain_pad, int &max_seg_iou_compute,
    int &repeat_iou_compute, int &remain_iou_compute,
    int &remain_pad_iou_compute) {
  int avg_core = input_box_num / core_limit;
  int rem = input_box_num % core_limit;
  int len_core = avg_core + (coreId < rem ? 1 : 0);
  input_offset = avg_core * coreId + (coreId <= rem ? coreId : rem);
  max_seg_pad = NMS_DOWN(limit, NMS_SIZE);
  repeat = len_core / max_seg_pad;
  remain = len_core % max_seg_pad;
  remain_pad = NMS_UP(remain, NMS_SIZE);
  // if datatype is fp16, we should cvt to fp32 when compute iou
  max_seg_iou_compute = NMS_DOWN(max_seg_pad / (4 / input_dwidth), NMS_SIZE);
  repeat_iou_compute = len_core / max_seg_iou_compute;
  remain_iou_compute = len_core % max_seg_iou_compute;
  remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
}
__mlu_func__ void getComputeParamsUx(
    const int input_dwidth, const int input_num_boxes, const int limit,
    int &input_offset, int &max_seg_pad, int &repeat, int &remain,
    int &remain_pad, int &max_seg_iou_compute, int &repeat_iou_compute,
    int &remain_iou_compute, int &remain_pad_iou_compute) {
  // data split
  int avg_cluster = input_num_boxes / clusterDim;
  int rem_cluster = input_num_boxes % clusterDim;
  int len_cluster = avg_cluster + (clusterId < rem_cluster);
  int cluster_offset = avg_cluster * clusterId +
                       (clusterId <= rem_cluster ? clusterId : rem_cluster);
  int avg_core = len_cluster / coreDim;
  int rem_core = len_cluster % coreDim;
  int len_core = avg_core + (coreId < rem_core);
  int core_offset =
      avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
  input_offset = cluster_offset + core_offset;
  max_seg_pad = NMS_DOWN(limit, NMS_SIZE);
  // core 0 of each cluster calculate the max score index
  int max_index_len_core = avg_cluster + (clusterId < rem_cluster);
  repeat = max_index_len_core / max_seg_pad;
  remain = max_index_len_core % max_seg_pad;
  remain_pad = NMS_UP(remain, NMS_SIZE);
  // if datatype is fp16, we should cvt to fp32 when compute iou
  max_seg_iou_compute =
      NMS_DOWN(max_seg_pad / (sizeof(float) / input_dwidth), NMS_SIZE);
  repeat_iou_compute = len_core / max_seg_iou_compute;
  remain_iou_compute = len_core % max_seg_iou_compute;
  remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
}
template <typename IN_DT>
__mlu_func__ void findGlobalMaxBox(IN_DT *max_box, IN_DT *sram,
                                   IN_DT *inter_x1) {
  // copy all partial max to the sram of cluster 0
  if (clusterId != 0) {
    __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT),
             SRAM2SRAM, 0);
  }
  __sync_all();
  // reduce between clusters to get the global max box
  if (clusterId == 0) {
    if (coreId == 0) {
      __bang_write_zero(inter_x1, NMS_SIZE);
      __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
               REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
      __bang_max(max_box, inter_x1, NMS_SIZE);
      int max_cluster = (sizeof(IN_DT) == sizeof(half))
                            ? ((uint16_t *)max_box)[1]
                            : ((uint32_t *)max_box)[1];
      __memcpy(max_box, sram + max_cluster * REDUCE_NUM,
               REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
      __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
    }
    __sync_cluster();
    if (coreId == 0x80 && clusterDim > 1) {
      // broadcast global max box to each cluster's sram
      for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
        __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
                 cluster_idx);
      }
    }
    __sync_cluster();
  }
  __sync_all();
  // copy the global max box to max_box
  __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
}
template <typename IN_DT>
__mlu_func__ void findCoreMaxBox(
    IN_DT *input_score_ptr, IN_DT *score, IN_DT *inter_x1, IN_DT *max_box,
    const IN_DT *input_x1_ptr, const IN_DT *input_y1_ptr,
    const IN_DT *input_x2_ptr, const IN_DT *input_y2_ptr,
    const mluMemcpyDirection_t load_dir, const int input_offset,
    const int repeat, const int remain, const int remain_pad,
    const int max_seg_pad, int &max_index) {
  if (coreId != 0x80) {
    for (int i = 0; i <= repeat; i++) {
      if (i == repeat && remain == 0) {
        break;
      }
      int seg_len = 0;  // the length every nms compute
      int cpy_len = 0;  // the length every nms memcpy
      i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad;
      i == repeat ? cpy_len = remain : cpy_len = max_seg_pad;
      /******NMS LOAD START******/
      __bang_write_zero(score, seg_len);
      __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
               cpy_len * sizeof(IN_DT), 0);
      /******NMS LOAD END******/
      __bang_max(inter_x1, score, seg_len);
      if (inter_x1[0] > max_box[0]) {
        max_box[0] = inter_x1[0];
        if (sizeof(IN_DT) == sizeof(half)) {
          max_index = ((uint16_t *)inter_x1)[1] + input_offset +
                      i * max_seg_pad;  // offset start from head of input_data
        } else if (sizeof(IN_DT) == sizeof(float)) {
          max_index = ((uint32_t *)inter_x1)[1] + input_offset +
                      i * max_seg_pad;  // offset start from head of input_data
        }
      }
    }  // for repeat
    // the max box's x1, y1, x2, y2 on every core
    max_box[1] = input_x1_ptr[max_index];
    max_box[2] = input_y1_ptr[max_index];
    max_box[3] = input_x2_ptr[max_index];
    max_box[4] = input_y2_ptr[max_index];
    ((uint32_t *)(max_box + 5))[0] = max_index;
  }
}
template <typename IN_DT>
__mlu_func__ void findClusterMaxBox(IN_DT *sram, IN_DT *max_box,
                                    IN_DT *inter_x1, IN_DT *input_data_score,
                                    const int core_limit) {
  // find the max with sram
  // copy every core's box info to sram, form: score---x1---y1---x2---y2---
  __memcpy(sram + REDUCE_NUM * coreId, max_box, REDUCE_NUM * sizeof(IN_DT),
           NRAM2SRAM);  // int32_t datatype
  __sync_cluster();
  // copy score from sram to nram and find the max
  __bang_write_zero(inter_x1, 64);
  __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
           REDUCE_NUM * sizeof(IN_DT), coreDim - 1);
  __bang_max(max_box, inter_x1, 64);
  int max_core = sizeof(IN_DT) == sizeof(half) ? ((uint16_t *)max_box)[1]
                                               : ((uint32_t *)max_box)[1];
  // copy the max box to max_box
  __memcpy(max_box, sram + max_core * REDUCE_NUM, REDUCE_NUM * sizeof(IN_DT),
           SRAM2NRAM);
}
/*****************************************************************************/
/*******************************CALCULATE MAX AREA****************************/
/*****************************************************************************/
template <typename IN_DT>
__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
                             float &max_area) {
  if (algo == 0 || offset == 0.0) {
    max_area = ((float)max_box[3] - (float)max_box[1]) *
               ((float)max_box[4] - (float)max_box[2]);
  } else {
    max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
               ((float)max_box[4] - (float)max_box[2] + offset);
  }
}
template <typename IN_DT>
__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
                             float &max_area, float &max_box_x1,
                             float &max_box_y1, float &max_box_x2,
                             float &max_box_y2) {
  // the case of random inf will break the requirement of x1<=x2, y1<=y2
  // so exchange it if it happens.
  max_box_x1 = float(max_box[1]);
  max_box_x2 = float(max_box[3]);
  if (max_box[1] > max_box[3]) {
    max_box_x1 = float(max_box[3]);
    max_box_x2 = float(max_box[1]);
  }
  max_box_y1 = float(max_box[2]);
  max_box_y2 = float(max_box[4]);
  if (max_box[2] > max_box[4]) {
    max_box_y1 = float(max_box[4]);
    max_box_y2 = float(max_box[2]);
  }
  if (algo == 0 || offset == 0.0) {
    max_area = (max_box_x2 - max_box_x1) * (max_box_y2 - max_box_y1);
  } else {
    max_area = (max_box_x2 - max_box_x1 + offset) *
               (max_box_y2 - max_box_y1 + offset);
  }
}
/***********************************************************************/
/*******************************STORE RESULT****************************/
/***********************************************************************/
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void storeResult(IN_DT *max_box, OUT_DT *nram_save,
                              OUT_DT *&output_dram, const int keep,
                              const int nram_save_limit_count,
                              const int max_output_size,
                              const float thresh_score, const int output_mode,
                              int &nram_save_count, uint32_t &output_box_num) {
  /******NMS STORE START******/
  // store to nram
  if (float(max_box[0]) > thresh_score) {
    OUT_DT *save_ptr;
    int save_offset = 0;
    int save_str_num = 0;
    save_ptr = nram_save;
    save_offset = nram_save_count;
    save_str_num = nram_save_limit_count;
    if (clusterId == 0 && coreId == 0) {
      if (output_mode == 0) {  // index1, index2, ...
        save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0];
      } else if (output_mode == 1) {  // score, x1, y1, x2, y2
        __memcpy(save_ptr + save_offset * INFO_NUM, max_box,
                 INFO_NUM * sizeof(IN_DT), NRAM2NRAM, INFO_NUM * sizeof(IN_DT),
                 INFO_NUM * sizeof(IN_DT), 0);
      } else if (output_mode == 2) {  // score---, x1---, y1---, x2---, y2---
        __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), NRAM2NRAM,
                 save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), 4);
      }
    }
    nram_save_count++;
    output_box_num++;
  }
  // store to sram/gdram
  if (output_box_num != 0) {
    if ((nram_save_count == nram_save_limit_count) ||
        (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) {
      if (nram_save_count != 0) {
        if (clusterId == 0 && coreId == 0) {
          if (output_mode == 0) {  // index1, index2, ...
            pvLock();
            __memcpy(output_dram, nram_save, nram_save_count * sizeof(uint32_t),
                     NRAM2GDRAM);
            pvUnlock();
            output_dram += nram_save_count;
          } else if (output_mode == 1) {  // score, x1, y1, x2, y2
            pvLock();
            __memcpy(output_dram, nram_save,
                     nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
            pvUnlock();
            output_dram += nram_save_count * INFO_NUM;
          } else if (output_mode == 2) {  // score---, x1---, y1---, x2---, y2---
            pvLock();
            __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
                     NRAM2GDRAM, max_output_size * sizeof(IN_DT),
                     nram_save_limit_count * sizeof(IN_DT), 4);
            pvUnlock();
            output_dram += nram_save_count;
          }
          nram_save_count = 0;
        }
      }
    }  // if move data nram->sram/gdram
  }    // if dst
}
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void scoreUpdate(
    IN_DT *input_score_ptr, const mluMemcpyDirection_t load_dir,
    const mluMemcpyDirection_t store_dir, const IN_DT *input_x1_ptr,
    const IN_DT *input_y1_ptr, const IN_DT *input_x2_ptr,
    const IN_DT *input_y2_ptr, IN_DT *x1, IN_DT *y1, IN_DT *x2, IN_DT *y2,
    IN_DT *score, IN_DT *inter_x1, IN_DT *inter_y1, IN_DT *inter_x2,
    IN_DT *inter_y2, IN_DT *max_box, const float max_box_x1,
    const float max_box_y1, const float max_box_x2, const float max_box_y2,
    OUT_DT *nram_save, int repeat_iou_compute, int remain_iou_compute,
    int remain_pad_iou_compute, int max_seg_iou_compute, int max_seg_pad,
    const float thresh_iou, const float div_thresh_iou, const int input_offset,
    const float offset, const float max_area, const int input_num_boxes,
    const int algo) {
  for (int i = 0; i <= repeat_iou_compute; i++) {
    if (i == repeat_iou_compute && remain_iou_compute == 0) {
      break;
    }
    int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute
                                            : max_seg_iou_compute;
    int cpy_len =
        (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute;
    /******NMS LOAD START******/
    int dt_offset = 0;
    if (sizeof(IN_DT) == sizeof(float)) {
      __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
               cpy_len * sizeof(IN_DT), 0);
      dt_offset = 0;
    } else if (sizeof(IN_DT) == sizeof(half)) {
      __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
               cpy_len * sizeof(IN_DT), 0);
      __bang_half2float((float *)score, (half *)x1, seg_len);
      dt_offset = max_seg_iou_compute;
    }
#if __BANG_ARCH__ >= 300
    __memcpy(inter_x1 + dt_offset,
             input_x1_ptr + input_offset + i * max_seg_iou_compute,
             cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
             input_num_boxes * sizeof(IN_DT), 3);
    if (sizeof(IN_DT) == sizeof(half)) {
      __bang_half2float((float *)inter_x1,
                        (half *)inter_x1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)inter_y1,
                        (half *)inter_y1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)inter_x2,
                        (half *)inter_x2 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)inter_y2,
                        (half *)inter_y2 + max_seg_iou_compute, seg_len);
    }
    // box transfer
    __bang_minequal((float *)x1, (float *)inter_x1, (float *)inter_x2, seg_len);
    __bang_maxequal((float *)x2, (float *)inter_x1, (float *)inter_x2, seg_len);
    __bang_minequal((float *)y1, (float *)inter_y1, (float *)inter_y2, seg_len);
    __bang_maxequal((float *)y2, (float *)inter_y1, (float *)inter_y2, seg_len);
    // 1、 compute IOU
    // get the area_I
    __bang_maxeq_scalar((float *)inter_x1, (float *)x1, max_box_x1,
                        seg_len);  // inter_x1
    __bang_mineq_scalar((float *)inter_x2, (float *)x2, max_box_x2,
                        seg_len);  // inter_x2
    __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
    }
    computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
                 seg_len);  // inter_w
    __bang_maxeq_scalar((float *)inter_y1, (float *)y1, float(max_box_y1),
                        seg_len);  // inter_y1
    __bang_mineq_scalar((float *)inter_y2, (float *)y2, float(max_box_y2),
                        seg_len);  // inter_y2
    __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
    }
    computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
                 seg_len);  // inter_h
    __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
               seg_len);  // area_I
    // get the area of input_box: area = (x2 - x1) * (y2 - y1);
    if (algo == 1 && offset != 0.0) {
      __bang_fusion(FUSION_FSA, (float *)inter_y1, (float *)x2, (float *)x1,
                    offset, seg_len, seg_len);
      __bang_fusion(FUSION_FSA, (float *)inter_y2, (float *)y2, (float *)y1,
                    offset, seg_len, seg_len);
      __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
                 seg_len);  // area
    } else {
      __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
      __bang_fusion(FUSION_FSM, (float *)inter_x2, (float *)y2, (float *)y1,
                    (float *)inter_y1, seg_len, seg_len);
    }
    // get the area_U: area + max_area - area_I
    __bang_fusion(FUSION_FAS, (float *)inter_x2, (float *)inter_x2, max_area,
                  (float *)inter_x1, seg_len, seg_len);
    // 2、 select the box
    // if IOU greater than thres, set the score to zero, abort it: area_U >
    // area_I * (1 / thresh)?
    if (thresh_iou > 0.0) {
      __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
                        seg_len);
    } else {
      __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
                        seg_len);
    }
    // process for nan
    __bang_lt((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len);
    __bang_not((float *)inter_x1, (float *)inter_x1, seg_len);
    __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
    /******NMS COMPUTE END******/
#else
    __memcpy(x1 + dt_offset,
             input_x1_ptr + input_offset + i * max_seg_iou_compute,
             cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
             input_num_boxes * sizeof(IN_DT), 3);
    if (sizeof(IN_DT) == sizeof(half)) {
      __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, seg_len);
    }
    // 1、 compute IOU
    // get the area_I
    __bang_write_value((float *)inter_y1, seg_len,
                       float(max_box[1]));  // max_x1
    __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
                    seg_len);  // inter_x1
    __bang_write_value((float *)inter_y2, seg_len,
                       float(max_box[3]));  // max_x2
    __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
                    seg_len);  // inter_x2
    __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
    }
    computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
                 seg_len);  // inter_w
    __bang_write_value((float *)inter_x2, seg_len,
                       float(max_box[2]));  // max_y1
    __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
                    seg_len);  // inter_y1
    __bang_write_value((float *)inter_x2, seg_len,
                       float(max_box[4]));  // max_y2
    __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
                    seg_len);  // inter_y2
    __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
    }
    computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
                 seg_len);  // inter_h
    __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
               seg_len);  // area_I
    // get the area of input_box: area = (x2 - x1) * (y2 - y1);
    __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
    __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
      __bang_add_scalar((float *)inter_y2, (float *)inter_y2, offset, seg_len);
    }
    __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
               seg_len);  // area
    // get the area_U: area + max_area - area_I
    __bang_add_scalar((float *)inter_x2, (float *)inter_x2, float(max_area),
                      seg_len);
    __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
               seg_len);  // area_U
    // 2、 select the box
    // if IOU greater than thresh, set the score to zero, abort it: area_U >
    // area_I * (1 / thresh)?
    if (thresh_iou > 0.0) {
      __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
                        seg_len);
    } else {
      __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
                        seg_len);
    }
    __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len);
    __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
    /******NMS COMPUTE END******/
#endif
    // update the score
    if (sizeof(IN_DT) == sizeof(half)) {
      convertFloat2half((half *)score, (float *)score, seg_len);
    }
    pvLock();
    __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
             cpy_len * sizeof(IN_DT), store_dir, cpy_len * sizeof(IN_DT),
             cpy_len * sizeof(IN_DT), 0);
    pvUnlock();
  }
}
#endif // NMS_UTILS_HPP_
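
As a quick check on the helpers above: NMS_UP(x, y) rounds x up to the next multiple of y and NMS_DOWN(x, y) rounds it down, and scoreUpdate avoids a per-element division by comparing area_U against area_I * (1 / thresh_iou) instead of forming the IoU ratio. The small host-side sketch below is illustrative only and is not part of this commit.

// Standalone sanity check; macros re-parenthesized here but arithmetically
// equivalent to NMS_UP / NMS_DOWN in nms_utils.hpp.
#include <cassert>
#define NMS_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
#define NMS_DOWN(x, y) (((x) / (y)) * (y))

int main() {
  assert(NMS_UP(130, 64) == 192);    // 130 rounded up to a multiple of 64
  assert(NMS_DOWN(130, 64) == 128);  // 130 rounded down to a multiple of 64
  // IoU > thresh  <=>  area_I / area_U > thresh  <=>  area_U < area_I * (1 / thresh)
  float area_I = 30.f, area_U = 100.f, thresh = 0.5f;
  assert((area_I / area_U > thresh) == (area_U < area_I * (1.f / thresh)));
  return 0;
}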
mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include "psamask_utils.hpp"
#define COMPUTE_COUNT_ALIGN 64
__nram__ char buf[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void swap(T &a, T &b) {
T tmp = a;
a = b;
b = tmp;
}
template <typename T>
__mlu_func__ void storeDataFromNramToDram(T *dst, const T *src,
const PositionInCore &position,
const Shape &shape_full) {
int n_offset = shape_full.h * shape_full.w * shape_full.c;
int h_offset = shape_full.w * shape_full.c;
int w_offset = shape_full.c;
int n_seg = position.n_end - position.n_start;
int h_seg = position.h_end - position.h_start;
int w_seg = position.w_end - position.w_start;
int size = h_seg * w_seg * shape_full.c;
__memcpy(dst + position.n_start * n_offset + position.h_start * h_offset +
position.w_start * w_offset,
src, size * sizeof(T), NRAM2GDRAM, n_offset * sizeof(T),
size * sizeof(T), n_seg - 1);
}
template <typename T>
__mlu_func__ void loadDataFromDramToNram(T *dst, const T *src,
const PositionInCore &position,
const Shape &shape_full) {
int n_offset = shape_full.h * shape_full.w * shape_full.c;
int h_offset = shape_full.w * shape_full.c;
int w_offset = shape_full.c;
int n_seg = position.n_end - position.n_start;
int h_seg = position.h_end - position.h_start;
int w_seg = position.w_end - position.w_start;
int size = h_seg * w_seg * shape_full.c;
__memcpy(dst, src + position.n_start * n_offset +
position.h_start * h_offset + position.w_start * w_offset,
size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T),
n_seg - 1);
}
// transpose the data from A*B*C*(D*E) to A*D*E*(B*C)
template <typename T>
__mlu_func__ void transposeData(T *dst, T *src, const Shape &shape_seg) {
int align_c = CEIL_ALIGN(shape_seg.c, COMPUTE_COUNT_ALIGN / sizeof(T));
int align_hw =
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
for (int i = 0; i < shape_seg.n; ++i) {
__bang_transpose(dst, src, align_hw, align_c);
dst += align_hw * align_c;
src += align_hw * align_c;
}
}
template <typename T>
__mlu_func__ void psamaskCollectForward(
const T *x_dram, T *y_dram, const PositionInCore &position,
const Shape &x_full, const Shape &y_full, const Shape &shape_seg,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
T *x_nram = (T *)buf;
T *y_nram =
x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c,
COMPUTE_COUNT_ALIGN / sizeof(T));
loadDataFromDramToNram(x_nram, x_dram, position, x_full);
// fill zeros to output
int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c,
NFU_ALIGN_SIZE / sizeof(T));
__bang_write_value(y_nram, elem_count, (T)0);
int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c;
int y_h_offset = shape_seg.w * shape_seg.c;
int y_w_offset = shape_seg.c;
int x_n_offset = shape_seg.h * shape_seg.w * x_full.c;
int y_c_offset = 1;
int x_h_offset = shape_seg.w * x_full.c;
int x_w_offset = x_full.c;
int x_c_offset = 1;
int x_start = 0;
int y_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
for (int widx = 0; widx < shape_seg.w; ++widx) {
int h_abs = hidx + position.h_start;
int w_abs = widx + position.w_start;
int y_offset = y_start;
int x_offset = x_start;
y_offset += hidx * y_h_offset + widx * y_w_offset;
x_offset += hidx * x_h_offset + widx * x_w_offset;
const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
const int hend = x_full.h + half_h_mask - h_abs < h_mask
? x_full.h + half_h_mask - h_abs
: h_mask;
const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
const int wend = x_full.w + half_w_mask - w_abs < w_mask
? x_full.w + half_w_mask - w_abs
: w_mask;
// (h, w ) with mask-indexed
// (h + hidx - half_h_mask, w + widx - half_w_mask) with feature-indexed
y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart +
w_abs - half_w_mask) *
y_c_offset;
x_offset += (hstart * w_mask + wstart) * x_c_offset;
int count = wend - wstart;
__memcpy(y_nram + y_offset, x_nram + x_offset, count * sizeof(T),
NRAM2NRAM, y_c_offset * x_full.w * sizeof(T),
x_c_offset * w_mask * sizeof(T), hend - hstart - 1);
}
}
y_start += y_n_offset;
x_start += x_n_offset;
}
storeDataFromNramToDram(y_dram, y_nram, position, y_full);
}
template <typename T>
__mlu_func__ void psamaskDistributeForward(
const T *x_dram, T *y_dram, const PositionInCore &position,
const Shape &x_full, const Shape &y_full, const Shape &shape_seg,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
T *x_nram = (T *)buf;
T *y_nram_temp =
x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c,
COMPUTE_COUNT_ALIGN / sizeof(T));
loadDataFromDramToNram(x_nram, x_dram, position, x_full);
// fill zeros to output
int align_c = CEIL_ALIGN(y_full.c, COMPUTE_COUNT_ALIGN / sizeof(T));
int align_hw =
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
int elem_count =
CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T));
__bang_write_value(y_nram_temp, elem_count, (T)0);
int y_n_offset = align_hw * align_c;
int y_h_offset = shape_seg.w * align_c;
int y_w_offset = align_c;
int y_c_offset = 1;
int x_n_offset = shape_seg.h * shape_seg.w * x_full.c;
int x_h_offset = shape_seg.w * x_full.c;
int x_w_offset = x_full.c;
int x_c_offset = 1;
int h_feature = y_full.h;
int w_feature = y_full.w;
int y_start = 0;
int x_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
for (int widx = 0; widx < shape_seg.w; ++widx) {
int h_abs = hidx + position.h_start;
int w_abs = widx + position.w_start;
int y_offset = y_start;
int x_offset = x_start;
y_offset += hidx * y_h_offset + widx * y_w_offset;
x_offset += hidx * x_h_offset + widx * x_w_offset;
const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
const int hend = h_feature + half_h_mask - h_abs < h_mask
? h_feature + half_h_mask - h_abs
: h_mask;
const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
const int wend = w_feature + half_w_mask - w_abs < w_mask
? w_feature + half_w_mask - w_abs
: w_mask;
// (h, w ) with mask-indexed
// (h + hidx - half_h_mask, w + widx - half_w_mask) with feature-indexed
y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart +
w_abs - half_w_mask) *
y_c_offset;
x_offset += (hstart * w_mask + wstart) * x_c_offset;
int count = wend - wstart;
__memcpy(y_nram_temp + y_offset, x_nram + x_offset, count * sizeof(T),
NRAM2NRAM, y_c_offset * w_feature * sizeof(T),
x_c_offset * w_mask * sizeof(T), hend - hstart - 1);
}
}
y_start += y_n_offset;
x_start += x_n_offset;
}
// transpose y
T *y_nram = y_nram_temp + shape_seg.n * align_hw * align_c;
Shape y_seg{shape_seg.n, shape_seg.h, shape_seg.w, y_full.c};
transposeData(y_nram, y_nram_temp, y_seg);
swap(align_c, align_hw);
// store y from nram to dram
int y_n_offset_full = y_full.h * y_full.w * y_full.c;
int y_w_offset_full = y_full.c;
int y_c_offset_full = 1;
int y_dram_start =
position.n_start * y_n_offset_full +
(position.h_start * y_full.w + position.w_start) * y_c_offset_full;
int y_nram_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
int y_dram_offset = y_dram_start + nidx * y_n_offset_full;
int y_nram_offset = y_nram_start + nidx * align_hw * align_c;
__memcpy(y_dram + y_dram_offset, y_nram + y_nram_offset,
shape_seg.h * shape_seg.w * sizeof(T), NRAM2GDRAM,
y_w_offset_full * sizeof(T), align_c * sizeof(T),
h_feature * w_feature - 1);
}
}
template <typename T>
__mlu_func__ void psamaskCollectBackward(
const T *dy_dram, T *dx_dram, const PositionInCore &position,
const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
T *dy_nram = (T *)buf;
T *dx_nram =
dy_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * dy_full.c,
COMPUTE_COUNT_ALIGN / sizeof(T));
loadDataFromDramToNram(dy_nram, dy_dram, position, dy_full);
// fill zeros to output
int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c,
NFU_ALIGN_SIZE / sizeof(T));
__bang_write_value(dx_nram, elem_count, (T)0);
int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c;
int dy_h_offset = shape_seg.w * dy_full.c;
int dy_w_offset = dy_full.c;
int dy_c_offset = 1;
int dx_n_offset = shape_seg.h * shape_seg.w * dx_full.c;
int dx_h_offset = shape_seg.w * dx_full.c;
int dx_w_offset = dx_full.c;
int dx_c_offset = 1;
int h_feature = dy_full.h;
int w_feature = dy_full.w;
int dy_start = 0;
int dx_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
for (int widx = 0; widx < shape_seg.w; ++widx) {
int h_abs = hidx + position.h_start;
int w_abs = widx + position.w_start;
int dy_offset = dy_start;
int dx_offset = dx_start;
dy_offset += hidx * dy_h_offset + widx * dy_w_offset;
dx_offset += hidx * dx_h_offset + widx * dx_w_offset;
const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
const int hend = h_feature + half_h_mask - h_abs < h_mask
? h_feature + half_h_mask - h_abs
: h_mask;
const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
const int wend = w_feature + half_w_mask - w_abs < w_mask
? w_feature + half_w_mask - w_abs
: w_mask;
// (h, w ) with mask-indexed
// (h + h_abs - half_h_mask, w + w_abs - half_w_mask) with
// feature-indexed
dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart +
w_abs - half_w_mask) *
dy_c_offset;
dx_offset += (hstart * w_mask + wstart) * dx_c_offset;
int count = wend - wstart;
__memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T),
NRAM2NRAM, dx_c_offset * w_mask * sizeof(T),
dy_c_offset * w_feature * sizeof(T), hend - hstart - 1);
}
}
dy_start += dy_n_offset;
dx_start += dx_n_offset;
}
storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full);
}
template <typename T>
__mlu_func__ void psamaskDistributeBackward(
const T *dy_dram, T *dx_dram, const PositionInCore &position,
const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
// load dy from dram to nram
T *dy_nram_temp = (T *)buf;
int dy_n_offset_full = dy_full.h * dy_full.w * dy_full.c;
int dy_c_offset_full = 1;
int h_feature = dy_full.h;
int w_feature = dy_full.w;
int align_c =
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
int align_hw =
CEIL_ALIGN(h_feature * w_feature, COMPUTE_COUNT_ALIGN / sizeof(T));
int dy_dram_start =
position.n_start * dy_n_offset_full +
(position.h_start * w_feature + position.w_start) * dy_c_offset_full;
int dy_nram_start = 0;
for (int i = 0; i < shape_seg.n; ++i) {
int dy_nram_offset = dy_nram_start + i * (align_hw * align_c);
int dy_dram_offset = dy_dram_start + i * dy_n_offset_full;
__memcpy(dy_nram_temp + dy_nram_offset, dy_dram + dy_dram_offset,
shape_seg.h * shape_seg.w * sizeof(T), GDRAM2NRAM,
align_c * sizeof(T), dy_full.c * sizeof(T),
h_feature * w_feature - 1);
}
T *dy_nram = dy_nram_temp + shape_seg.n * align_hw * align_c;
Shape dy_seg{shape_seg.n, h_feature, w_feature, shape_seg.h * shape_seg.w};
transposeData(dy_nram, dy_nram_temp, dy_seg);
swap(align_c, align_hw);
// fill zeros to dx
T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c;
int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c;
__bang_write_value(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)),
(T)0);
int dy_n_offset_seg = align_hw * align_c;
int dy_h_offset_seg = shape_seg.w * align_c;
int dy_w_offset_seg = align_c;
int dy_c_offset_seg = 1;
int dx_n_offset_seg = shape_seg.h * shape_seg.w * shape_seg.c;
int dx_h_offset_seg = shape_seg.w * shape_seg.c;
int dx_w_offset_seg = shape_seg.c;
int dx_c_offset_seg = 1;
int dy_start = 0;
int dx_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
for (int widx = 0; widx < shape_seg.w; ++widx) {
int h_abs = hidx + position.h_start;
int w_abs = widx + position.w_start;
int dy_offset = dy_start;
int dx_offset = dx_start;
dy_offset += hidx * dy_h_offset_seg + widx * dy_w_offset_seg;
dx_offset += hidx * dx_h_offset_seg + widx * dx_w_offset_seg;
const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
const int hend = h_feature + half_h_mask - h_abs < h_mask
? h_feature + half_h_mask - h_abs
: h_mask;
const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
const int wend = w_feature + half_w_mask - w_abs < w_mask
? w_feature + half_w_mask - w_abs
: w_mask;
// (h, w ) with mask-indexed
// (h + h_abs - half_h_mask, w + w_abs - half_w_mask) with
// feature-indexed
dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart +
w_abs - half_w_mask) *
dy_c_offset_seg;
dx_offset += (hstart * w_mask + wstart) * dx_c_offset_seg;
int count = wend - wstart;
__memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T),
NRAM2NRAM, w_mask * dx_c_offset_seg * sizeof(T),
w_feature * dy_c_offset_seg * sizeof(T), hend - hstart - 1);
}
}
dy_start += dy_n_offset_seg;
dx_start += dx_n_offset_seg;
}
storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full);
}
template <typename T>
__mlu_func__ void psamaskBase(const T *input_dram, T *output_dram,
const Shape &input_full, const Shape &output_full,
LimitParam &limit, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition,
const bool is_forward, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const int n_per_core,
const int h_per_core, const int n_per_cluster,
const int h_per_cluster) {
PositionInCore position_full;
PositionInCore position_seg;
position_full.w_start = 0;
position_full.w_end = output_full.w;
int n_num_in_cluster = n_per_cluster;
int h_num_in_cluster = h_per_cluster;
switch (cluster_partition) {
case PARTITION_N: {
position_full.h_start = 0;
position_full.h_end = input_full.h;
position_full.n_start = taskIdY * n_per_cluster;
int cluster_need = (input_full.n + n_per_cluster - 1) / n_per_cluster;
if (taskIdY >= cluster_need) return;
int n_remainder = input_full.n - (cluster_need - 1) * n_per_cluster;
n_num_in_cluster =
(taskIdY == cluster_need - 1) ? n_remainder : n_per_cluster;
position_full.n_end = position_full.n_start + n_num_in_cluster;
}; break;
case PARTITION_H: {
position_full.n_start = 0;
position_full.n_end = input_full.n;
position_full.h_start = taskIdY * h_per_cluster;
int cluster_need = (input_full.h + h_per_cluster - 1) / h_per_cluster;
if (taskIdY >= cluster_need) return;
int h_remainder = input_full.h - (cluster_need - 1) * h_per_cluster;
h_num_in_cluster =
(taskIdY == cluster_need - 1) ? h_remainder : h_per_cluster;
position_full.h_end = position_full.h_start + h_num_in_cluster;
}; break;
}
switch (core_partition) {
case PARTITION_N: {
position_full.n_start += taskIdX * n_per_core;
int core_need = (n_num_in_cluster + n_per_core - 1) / n_per_core;
if (taskIdX >= core_need) return;
int n_remainder = n_num_in_cluster - (core_need - 1) * n_per_core;
position_full.n_end =
position_full.n_start +
((taskIdX == core_need - 1) ? n_remainder : n_per_core);
}; break;
case PARTITION_H: {
position_full.h_start += taskIdX * h_per_core;
int core_need = (h_num_in_cluster + h_per_core - 1) / h_per_core;
if (taskIdX >= core_need) return;
int h_remainder = h_num_in_cluster - (core_need - 1) * h_per_core;
position_full.h_end =
position_full.h_start +
((taskIdX == core_need - 1) ? h_remainder : h_per_core);
}; break;
}
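  // position_full now describes the [n_start, n_end) x [h_start, h_end) block
  // handled by the current core; the w dimension always covers the full width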
  // the counts of n, h and w that need to be processed by the current core
int shape_core_n = position_full.n_end - position_full.n_start;
int shape_core_h = position_full.h_end - position_full.h_start;
int shape_core_w = input_full.w;
limit.n = limit.n < shape_core_n ? limit.n : shape_core_n;
limit.h = limit.h < shape_core_h ? limit.h : shape_core_h;
limit.w = limit.w < shape_core_w ? limit.w : shape_core_w;
// load the data to nram according to the limit
for (int nidx = position_full.n_start; nidx < position_full.n_end;
nidx += limit.n) {
position_seg.n_start = nidx;
position_seg.n_end =
position_seg.n_start + (position_full.n_end - nidx < limit.n
? position_full.n_end - nidx
: limit.n);
for (int hidx = position_full.h_start; hidx < position_full.h_end;
hidx += limit.h) {
position_seg.h_start = hidx;
position_seg.h_end =
position_seg.h_start + (position_full.h_end - hidx < limit.h
? position_full.h_end - hidx
: limit.h);
for (int widx = position_full.w_start; widx < position_full.w_end;
widx += limit.w) {
position_seg.w_start = widx;
position_seg.w_end =
position_seg.w_start + (position_full.w_end - widx < limit.w
? position_full.w_end - widx
: limit.w);
        // record the output segment sizes except for the channel dimension;
        // the channel segments of output and input are identical
Shape shape_seg;
shape_seg.n = position_seg.n_end - position_seg.n_start;
shape_seg.h = position_seg.h_end - position_seg.h_start;
shape_seg.w = position_seg.w_end - position_seg.w_start;
shape_seg.c = output_full.c;
switch (psa_type) {
case COLLECT: {
if (is_forward) {
psamaskCollectForward(input_dram, output_dram, position_seg,
input_full, output_full, shape_seg, h_mask,
w_mask, half_h_mask, half_w_mask);
} else {
psamaskCollectBackward(input_dram, output_dram, position_seg,
input_full, output_full, shape_seg, h_mask,
w_mask, half_h_mask, half_w_mask);
}
} break;
case DISTRIBUTE: {
if (is_forward) {
psamaskDistributeForward(input_dram, output_dram, position_seg,
input_full, output_full, shape_seg,
h_mask, w_mask, half_h_mask,
half_w_mask);
} else {
psamaskDistributeBackward(input_dram, output_dram, position_seg,
input_full, output_full, shape_seg,
h_mask, w_mask, half_h_mask,
half_w_mask);
}
} break;
}
}
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelPsamaskForward(
const T *x, T *y, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int x_c, const int y_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg) {
if (coreId == 0x80) {
return;
}
Shape x_full, y_full;
x_full.n = batch;
x_full.h = h_feature;
x_full.w = w_feature;
x_full.c = x_c;
y_full.n = batch;
y_full.h = h_feature;
y_full.w = w_feature;
y_full.c = y_c;
LimitParam limit;
limit.n = limit_n_seg;
limit.h = limit_h_seg;
limit.w = limit_w_seg;
psamaskBase(x, y, x_full, y_full, limit, psa_type, core_partition,
cluster_partition, true, h_mask, w_mask, half_h_mask, half_w_mask,
n_per_core, h_per_core, n_per_cluster, h_per_cluster);
}
template <typename T>
__mlu_global__ void MLUUnion1KernelPsamaskBackward(
const T *dy, T *dx, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int dx_c, const int dy_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg) {
if (coreId == 0x80) {
return;
}
Shape dy_full, dx_full;
dx_full.n = batch;
dx_full.h = h_feature;
dx_full.w = w_feature;
dx_full.c = dx_c;
dy_full.n = batch;
dy_full.h = h_feature;
dy_full.w = w_feature;
dy_full.c = dy_c;
LimitParam limit;
limit.n = limit_n_seg;
limit.h = limit_h_seg;
limit.w = limit_w_seg;
psamaskBase(dy, dx, dy_full, dx_full, limit, psa_type, core_partition,
cluster_partition, false, h_mask, w_mask, half_h_mask,
half_w_mask, n_per_core, h_per_core, n_per_cluster,
h_per_cluster);
}
void KernelPsamaskForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *x, void *y, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int x_c, const int y_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg) {
MLUUnion1KernelPsamaskForward<<<k_dim, k_type, queue>>>(
static_cast<const float *>(x), static_cast<float *>(y), psa_type,
core_partition, cluster_partition, batch, h_feature, w_feature, h_mask,
w_mask, x_c, y_c, half_h_mask, half_w_mask, n_per_core, h_per_core,
n_per_cluster, h_per_cluster, limit_n_seg, limit_h_seg, limit_w_seg);
}
void KernelPsamaskBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *dy, void *dx, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int dx_c, const int dy_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg) {
MLUUnion1KernelPsamaskBackward<<<k_dim, k_type, queue>>>(
static_cast<const float *>(dy), static_cast<float *>(dx), psa_type,
core_partition, cluster_partition, batch, h_feature, w_feature, h_mask,
w_mask, dx_c, dy_c, half_h_mask, half_w_mask, n_per_core, h_per_core,
n_per_cluster, h_per_cluster, limit_n_seg, limit_h_seg, limit_w_seg);
}
mmcv/ops/csrc/common/mlu/psamask_utils.hpp
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef PSAMASK_UTILS_HPP_
#define PSAMASK_UTILS_HPP_
typedef enum {
  COLLECT = 0,
  DISTRIBUTE = 1,
} PsamaskType;
typedef enum {
  PARTITION_N = 0,
  PARTITION_H = 1,
} DimPartitionType;
struct PartitionSeg {
  int h_per_cluster;
  int n_per_cluster;
  int h_per_core;
  int n_per_core;
  DimPartitionType cluster_partition;
  DimPartitionType core_partition;
};
struct Shape {
  int n;
  int h;
  int w;
  int c;
};
struct LimitParam {
  int n;
  int h;
  int w;
};
struct PositionInCore {
  int n_start;
  int n_end;
  int h_start;
  int h_end;
  int w_start;
  int w_end;
};
#endif // PSAMASK_UTILS_HPP_
mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#define ROI_OFFSET 5
__nram__ char buffer[MAX_NRAM_SIZE];
namespace forward {
template <typename T>
__mlu_func__ void bilinearInterpolate(const int input_height,
const int input_width, T y, T x, T *w1,
T *w2, T *w3, T *w4, int *x_low,
int *x_high, int *y_low, int *y_high,
bool *empty) {
  // deal with cases where the sampling point is outside the feature map
  // boundary
if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) {
*empty = true;
return;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low_ = int(y);
int x_low_ = int(x);
if (y_low_ >= input_height - 1) {
*y_high = y_low_ = input_height - 1;
y = (T)y_low_;
} else {
*y_high = y_low_ + 1;
}
if (x_low_ >= input_width - 1) {
*x_high = x_low_ = input_width - 1;
x = T(x_low_);
} else {
*x_high = x_low_ + 1;
}
*y_low = y_low_;
*x_low = x_low_;
T ly = y - y_low_;
T lx = x - x_low_;
T hy = 1.0 - ly;
T hx = 1.0 - lx;
*w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
return;
}
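// Illustrative example (assuming a sufficiently large feature map): for a
// sampling point (y, x) = (2.25, 3.5) the indices are y_low = 2, y_high = 3,
// x_low = 3, x_high = 4, and the weights are w1 = 0.375, w2 = 0.375,
// w3 = 0.125 and w4 = 0.125, which always sum to 1.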
template <typename T>
__mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
T *nram_out, const int roi_bin_grid_h,
const int roi_bin_grid_w, const T roi_start_h,
const T roi_start_w, const int ph,
const int pw, const T bin_size_h,
const T bin_size_w, const float count,
const int input_height, const int input_width,
const int channels, const int cyc_num,
const int max_elements) {
int cyc_channel = max_elements;
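  // channels are processed in chunks of at most max_elements elements so that
  // the six per-channel buffers (the output sum, the interpolated value and
  // the four sampled pixels) fit in NRAM at the same time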
for (int i = 0; i < cyc_num; i++) {
int real_channel =
(i == cyc_num - 1) ? channels - i * cyc_channel : cyc_channel;
int align_channel = PAD_UP(real_channel, NFU_ALIGN_SIZE / sizeof(T));
__bang_write_zero(nram_out, align_channel);
uint32_t real_size = real_channel * sizeof(T);
int iy, ix;
for (iy = 0; iy < roi_bin_grid_h; iy++) {
// 1. compute the coordinates of the y axis in the current roi_bin_grid_h
T y = roi_start_h + ph * bin_size_h +
(T)(iy + 0.5) * bin_size_h / (T)(roi_bin_grid_h);
for (ix = 0; ix < roi_bin_grid_w; ix++) {
// 2. compute the coordinates of the x axis in the current
// roi_bin_grid_w
T x = roi_start_w + pw * bin_size_w +
(T)(ix + 0.5) * bin_size_w / (T)(roi_bin_grid_w);
        // 3. compute the four weights (w1, w2, w3 and w4), the height indices
        // (y_low and y_high) and width indices (x_low and x_high) of the input
        // feature map in the current roi bin grid, and the flag (empty) which
        // shows whether x, y are out of the input feature map range
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bool empty = false;
bilinearInterpolate(input_height, input_width, y, x, &w1, &w2, &w3, &w4,
&x_low, &x_high, &y_low, &y_high, &empty);
// 4. compute interpolation of the current roi bin grid
// tmp_cyc1, temp_cyc2, tmp_cyc3 and tmp_cyc4 store the input values
// to compute the interpolation, and then reused to compute
// the argmax_x and argmax_y.
T *tmp_cyc1 = nram_in + cyc_channel;
T *tmp_cyc2 = nram_in + cyc_channel * 2;
T *tmp_cyc3 = nram_in + cyc_channel * 3;
T *tmp_cyc4 = nram_in + cyc_channel * 4;
        if (empty) {  // the sampling point falls outside the feature map
__bang_write_zero(nram_in, align_channel);
} else {
__bang_write_zero(nram_in, align_channel);
uint32_t offset1 = (y_low * input_width + x_low) * channels;
uint32_t offset2 = (y_low * input_width + x_high) * channels;
uint32_t offset3 = (y_high * input_width + x_low) * channels;
uint32_t offset4 = (y_high * input_width + x_high) * channels;
T *input1 = (T *)input_core + offset1 + i * cyc_channel;
T *input2 = (T *)input_core + offset2 + i * cyc_channel;
T *input3 = (T *)input_core + offset3 + i * cyc_channel;
T *input4 = (T *)input_core + offset4 + i * cyc_channel;
// load the four pixels (p1, p2, p3 and p4) of input feature map to
// compute interpolation
__memcpy(tmp_cyc1, input1, real_size, GDRAM2NRAM);
__memcpy(tmp_cyc2, input2, real_size, GDRAM2NRAM);
__memcpy(tmp_cyc3, input3, real_size, GDRAM2NRAM);
__memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM);
// interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
__bang_mul_scalar(tmp_cyc1, tmp_cyc1, w1, align_channel);
__bang_mul_scalar(tmp_cyc2, tmp_cyc2, w2, align_channel);
__bang_mul_scalar(tmp_cyc3, tmp_cyc3, w3, align_channel);
__bang_mul_scalar(tmp_cyc4, tmp_cyc4, w4, align_channel);
__bang_add(nram_in, tmp_cyc1, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc2, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc3, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc4, nram_in, align_channel);
}
// 5. compute sum value and corresponding coordinates of x axis and y
// axis. Update the sum value.
__bang_add(nram_out, nram_in, nram_out, align_channel);
} // loop_roi_grid_w
} // loop_roi_grid_h
T count_value = (T)(1.0 / count);
__bang_mul_scalar(nram_out, nram_out, count_value, align_channel);
__memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM);
} // loop_cyc_num
}
template <typename T>
__mlu_func__ void roialignForwardAvg(
T *input, T *rois, T *output, const bool aligned, const int channels,
const int pooled_height, const int pooled_width, const int input_height,
const int input_width, const int sampling_ratio, const T spatial_scale,
const int num_rois) {
  // find the channel limit: the nram space is divided into 6 parts, i.e. the
  // input, the 4 buffers for the weighted interpolation terms (w1, w2, w3,
  // w4), and the output
// max_elements : 300 : float datatype : 27296, half datatype : 54592
// max_elements : 200 : float datatype : 16384, half datatype : 32768
int max_elements = (PAD_DOWN(MAX_NRAM_SIZE / 6, NFU_ALIGN_SIZE)) / sizeof(T);
int cyc_num = channels / max_elements + (int)(channels % max_elements != 0);
T offset = aligned ? (T)0.5 : (T)0.0;
int task_num = num_rois * pooled_height * pooled_width;
T *nram_out = (T *)buffer;
T *nram_in = nram_out + max_elements;
if (task_num < taskDim) {
if (taskId >= task_num) {
return;
}
}
for (int bin_idx = taskId; bin_idx < task_num; bin_idx = bin_idx + taskDim) {
if (bin_idx >= task_num) {
return;
}
    // (n, ph, pw) indexes a channel vector in the pooled output
int pw = bin_idx % pooled_width;
int ph = (bin_idx / pooled_width) % pooled_height;
int n = bin_idx / pooled_width / pooled_height;
T *roi_id_tmp = rois + n * ROI_OFFSET;
// 1. compute width and height of roi region.
int batch_idx = (int)roi_id_tmp[0];
T roi_x1 = roi_id_tmp[1];
T roi_y1 = roi_id_tmp[2];
T roi_x2 = roi_id_tmp[3];
T roi_y2 = roi_id_tmp[4];
T roi_start_w = roi_x1 * spatial_scale - offset;
T roi_start_h = roi_y1 * spatial_scale - offset;
T roi_end_w = roi_x2 * spatial_scale - offset;
T roi_end_h = roi_y2 * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (!aligned) {
roi_width = roi_width > (T)(1.0) ? roi_width : (T)(1.0);
roi_height = roi_height > (T)(1.0) ? roi_height : (T)(1.0);
}
// 2. compute float-type width and height of roi bin region.
T bin_size_w = (T)roi_width / (T)pooled_width;
T bin_size_h = (T)roi_height / (T)pooled_height;
// 3. compute int-type width and height of roi bin region.
int roi_bin_grid_h, roi_bin_grid_w;
roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: int(ceilf(roi_height / pooled_height));
roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: int(ceilf(roi_width / pooled_width));
float count = (float)((roi_bin_grid_h * roi_bin_grid_w) > 1
? roi_bin_grid_h * roi_bin_grid_w
: 1.0);
T *input_core = input + batch_idx * channels * input_width * input_height;
T *output_core = output + bin_idx * channels;
// 4. compute avg value and corresponding coordinates of x axis and y axis.
computeChannel(input_core, nram_in, output_core, nram_out, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, ph, pw, bin_size_h,
bin_size_w, count, input_height, input_width, channels,
cyc_num, max_elements);
}
}
__mlu_global__ void MLUUnion1KernelRoiAlignAvg(
const void *input, const void *rois, const int channels, const bool aligned,
const int pooled_height, const int pooled_width, const int input_height,
const int input_width, const int sampling_ratio, const float spatial_scale,
const int num_rois, const cnrtDataType_t data_type, void *output) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (data_type) {
case CNRT_FLOAT16: {
roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned,
channels, pooled_height, pooled_width, input_height,
input_width, sampling_ratio, (half)spatial_scale,
num_rois);
}; break;
case CNRT_FLOAT32: {
roialignForwardAvg((float *)input, (float *)rois, (float *)output,
aligned, channels, pooled_height, pooled_width,
input_height, input_width, sampling_ratio,
(float)spatial_scale, num_rois);
}; break;
default:
break;
}
return;
}
} // namespace forward
namespace backward {
__mlu_func__ void bilinearInterpolateGradient(int height, int width, float y,
float x, float *w1, float *w2,
float *w3, float *w4, int *x_low,
int *x_high, int *y_low,
int *y_high) {
if (y < -1.0 || y > height || x < -1.0 || x > width) {
*w1 = 0.0, *w2 = 0.0, *w3 = 0.0, *w4 = 0.0;
*x_low = -1, *x_high = -1, *y_low = -1, *y_high = -1;
return;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
*y_low = (int)y;
*x_low = (int)x;
if (*y_low >= height - 1) {
*y_high = height - 1, *y_low = height - 1;
y = (float)(*y_low);
} else {
*y_high = *y_low + 1;
}
if (*x_low >= width - 1) {
*x_high = width - 1, *x_low = width - 1;
x = (float)(*x_low);
} else {
*x_high = *x_low + 1;
}
float ly = y - *y_low, lx = x - *x_low;
float hy = 1.0 - ly, hx = 1.0 - lx;
*w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
return;
}
template <typename T>
__mlu_func__ void unionRoiAlignBp(
T *grads, T *boxes, T *grads_image, const int boxes_num, const int hi,
const int wi, const int c, const int no, const int ho, const int wo,
const float spatial_scale, const int sampling_ratio, const bool aligned) {
int c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T));
int deal_all = boxes_num * hi * wi;
int deal_this_core = deal_all / taskDim + (int)(taskId < deal_all % taskDim);
for (int i = 0; i < deal_this_core; ++i) {
int bhw_id = i * taskDim + taskId;
int box_id = bhw_id / (hi * wi);
int ih = (bhw_id / wi) % hi;
int iw = bhw_id % wi;
T *box = boxes + box_id * 5;
int image_id = (int)box[0];
T *image_offset = grads_image + image_id * ho * wo * c;
T *grads_ = grads + box_id * hi * wi * c + ih * wi * c + iw * c;
float offset = aligned ? 0.5 : 0.0;
float x1 = box[1] * spatial_scale - offset;
float y1 = box[2] * spatial_scale - offset;
float x2 = box[3] * spatial_scale - offset;
float y2 = box[4] * spatial_scale - offset;
float roi_width = x2 - x1;
float roi_height = y2 - y1;
if (!aligned) {
roi_width = (roi_width > 1.0) ? roi_width : 1.0;
roi_height = (roi_height > 1.0) ? roi_height : 1.0;
}
float bin_size_h = roi_height / hi;
float bin_size_w = roi_width / wi;
int roi_grid_h =
(sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_height / hi);
int roi_grid_w =
(sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_width / wi);
const T count = roi_grid_h * roi_grid_w;
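    // count is the number of sampling points per bin; each point's gradient
    // contribution below is scaled by 1 / count to match the forward average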
if (c_align * sizeof(T) * 2 <= MAX_NRAM_SIZE) {
for (int iy = 0; iy < roi_grid_h; ++iy) {
const float y =
y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h;
for (int ix = 0; ix < roi_grid_w; ++ix) {
const float x =
x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w;
float w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low,
&x_high, &y_low, &y_high);
if (x_low >= 0 && y_low >= 0) {
__memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w1,
c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_low * c,
(T *)buffer + c_align, c);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w2,
c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_high * c,
(T *)buffer + c_align, c);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w3,
c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_low * c,
(T *)buffer + c_align, c);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w4,
c_align);
__bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_high * c,
(T *)buffer + c_align, c);
} // x_low && y_low
} // ix
} // iy
} else {
for (int iy = 0; iy < roi_grid_h; ++iy) {
const float y =
y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h;
for (int ix = 0; ix < roi_grid_w; ++ix) {
const float x =
x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w;
float w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low,
&x_high, &y_low, &y_high);
if (x_low >= 0 && y_low >= 0) {
int deal_once =
PAD_DOWN(MAX_NRAM_SIZE / 2, NFU_ALIGN_SIZE) / sizeof(T);
int c_repeat = c / deal_once + (int)(c % deal_once != 0);
for (int i = 0; i < c_repeat; ++i) {
int deal_c = deal_once;
int align_c = deal_once;
if (i == c_repeat - 1) {
deal_c = c - i * deal_once;
align_c = c_align - i * deal_once;
}
__memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T),
GDRAM2NRAM);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w1,
align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_low * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w2,
align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_low * wo * c + x_high * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w3,
align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_high * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w4,
align_c);
__bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_high * wo * c + x_high * c + i * deal_once,
(T *)buffer + align_c, deal_c);
} // for c_repeat
} // x_low >= 0 && y_low >= 0
} // ix
} // iy
} // if c
} // i
}
__mlu_global__ void MLUUnion1KernelRoiAlignBackward(
const void *grads, const void *boxes, void *grads_image,
const cnrtDataType_t dtype, const int boxes_num, const int hi, const int wi,
const int c, const int no, const int ho, const int wo,
const float spatial_scale, const int sampling_ratio, const bool aligned) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (dtype) {
case CNRT_FLOAT16: {
unionRoiAlignBp((half *)grads, (half *)boxes, (half *)grads_image,
boxes_num, hi, wi, c, no, ho, wo, spatial_scale,
sampling_ratio, aligned);
}; break;
case CNRT_FLOAT32: {
unionRoiAlignBp((float *)grads, (float *)boxes, (float *)grads_image,
boxes_num, hi, wi, c, no, ho, wo, spatial_scale,
sampling_ratio, aligned);
}; break;
default: { return; }
}
}
} // namespace backward
void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *input, const void *rois, const int channels,
const bool aligned, const int pooled_height,
const int pooled_width, const int input_height,
const int input_width, const int sampling_ratio,
const float spatial_scale, const int num_rois,
void *output) {
forward::MLUUnion1KernelRoiAlignAvg<<<k_dim, k_type, queue>>>(
input, rois, channels, aligned, pooled_height, pooled_width, input_height,
input_width, sampling_ratio, spatial_scale, num_rois, d_type, output);
}
void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t dtype,
const void *grads, const void *boxes,
void *grads_image, const int boxes_num,
const int hi, const int wi, const int c,
const int no, const int ho, const int wo,
const float spatial_scale, const int sampling_ratio,
const bool aligned) {
backward::MLUUnion1KernelRoiAlignBackward<<<k_dim, k_type, queue>>>(
grads, boxes, grads_image, dtype, boxes_num, hi, wi, c, no, ho, wo,
spatial_scale, sampling_ratio, aligned);
}
mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include "roi_align_rotated_utils.hpp"
#define ROI_OFFSET 6
#define SAMPLING_NUM 4
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void swap(T &a, T &b) {
T tmp = a;
a = b;
b = tmp;
}
template <typename T>
__mlu_func__ void bilinearInterpolate(const int input_height,
const int input_width, T x, T y, T *w1,
T *w2, T *w3, T *w4, int *x_low,
int *x_high, int *y_low, int *y_high,
bool *empty) {
  // deal with the case where the point is outside the feature map boundary
if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) {
*empty = true;
return;
}
if (y <= 0) y = (T)0;
if (x <= 0) x = (T)0;
*y_low = int(y);
*x_low = int(x);
if (*y_low >= input_height - 1) {
*y_high = *y_low = input_height - 1;
y = (T)(*y_low);
} else {
*y_high = *y_low + 1;
}
if (*x_low >= input_width - 1) {
*x_high = *x_low = input_width - 1;
x = T(*x_low);
} else {
*x_high = *x_low + 1;
}
T ly = y - *y_low;
T lx = x - *x_low;
T hy = 1.0 - ly;
T hx = 1.0 - lx;
*w1 = hy * hx;
*w2 = hy * lx;
*w3 = ly * hx;
*w4 = ly * lx;
return;
}
template <typename T>
__mlu_func__ void getRoiBinInfo(const T *rois_dram, const int bin_i,
const RoiAlignRotatedParams ¶ms,
int *batch_idx, int *roi_n, int *pw, int *ph,
T *roi_center_x, T *roi_center_y, T *roi_width,
T *roi_height, T *theta) {
T offset = params.aligned ? (T)0.5 : (T)0.0;
*pw = bin_i % params.pooled_width;
*ph = (bin_i / params.pooled_width) % params.pooled_height;
*roi_n = bin_i / params.pooled_width / params.pooled_height;
const T *roi_info = rois_dram + (*roi_n) * ROI_OFFSET;
*batch_idx = (int)roi_info[0];
*roi_center_x = roi_info[1] * (T)params.spatial_scale - offset;
*roi_center_y = roi_info[2] * (T)params.spatial_scale - offset;
*roi_width = roi_info[3] * (T)params.spatial_scale;
*roi_height = roi_info[4] * (T)params.spatial_scale;
*theta = roi_info[5];
if (params.clockwise) {
*theta = -(*theta);
}
if (!params.aligned) {
*roi_width = *roi_width > (T)1.0 ? *roi_width : (T)1.0;
*roi_height = *roi_height > (T)1.0 ? *roi_height : (T)1.0;
}
}
template <typename T>
__mlu_func__ void roiAlignRotatedForward(const T *input_dram,
const T *rois_dram, const int batch,
const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams ¶ms,
T *output_dram) {
int align_base_128 = NFU_ALIGN_SIZE / sizeof(T);
int channel_max_cap = MAX_NRAM_SIZE / sizeof(T) / (2 * SAMPLING_NUM + 1);
channel_max_cap = channel_max_cap / align_base_128 * align_base_128;
int channel_align = channel < channel_max_cap ? channel : channel_max_cap;
channel_align = CEIL_ALIGN(channel_align, align_base_128);
T *nram_out = (T *)nram_buffer;
T *nram_ping = nram_out + channel_align;
T *nram_pong = nram_ping + channel_align * SAMPLING_NUM;
int bin_first = taskId;
int bin_end = rois_num * params.pooled_height * params.pooled_width;
for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) {
T roi_center_x, roi_center_y, roi_width, roi_height, theta;
int batch_idx, roi_n, pw, ph;
getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph,
&roi_center_x, &roi_center_y, &roi_width, &roi_height,
&theta);
T bin_size_h = roi_height / params.pooled_height;
T bin_size_w = roi_width / params.pooled_width;
int roi_bin_grid_h =
(params.sample_ratio > 0)
? params.sample_ratio
: __float2int_up((float)roi_height / params.pooled_height);
int roi_bin_grid_w =
(params.sample_ratio > 0)
? params.sample_ratio
: __float2int_up((float)roi_width / params.pooled_width);
T roi_start_y = -roi_height / 2;
T roi_start_x = -roi_width / 2;
const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1
? roi_bin_grid_h * roi_bin_grid_w
: 1;
T cos_theta = std::cos(theta);
T sin_theta = std::sin(theta);
T zero_sign = 1.0f / bin_dim;
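    // zero_sign is the averaging factor 1 / bin_dim applied to the accumulated
    // samples before the bin result is written back to dram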
bool is_first_sample = true;
int src_offset = 0;
int dst_offset = 0;
int c_rem, c_slice, c_slice_align, pongc_slice, pongc_slice_align;
for (int c_offset = 0; c_offset < channel; c_offset += channel_align) {
__bang_write_value(nram_out, channel_align, (T)0);
c_rem = channel - c_offset;
c_slice = channel_align > c_rem ? c_rem : channel_align;
c_slice_align = CEIL_ALIGN(c_slice, align_base_128);
is_first_sample = true;
for (int iy = 0; iy < roi_bin_grid_h; ++iy) {
const T yy = roi_start_y + ph * bin_size_h +
T(iy + 0.5) * bin_size_h / roi_bin_grid_h;
for (int ix = 0; ix < roi_bin_grid_w; ++ix) {
const T xx = roi_start_x + pw * bin_size_w +
T(ix + 0.5) * bin_size_w / roi_bin_grid_w;
int sample_i = iy * roi_bin_grid_w + ix;
T y = yy * cos_theta - xx * sin_theta + roi_center_y;
T x = yy * sin_theta + xx * cos_theta + roi_center_x;
T w1, w2, w3, w4;
bool empty = false;
int x_low, x_high, y_low, y_high;
bilinearInterpolate(height, width, x, y, &w1, &w2, &w3, &w4, &x_low,
&x_high, &y_low, &y_high, &empty);
/*******************************************************
| ping | pong |
|------|-----|-----|-----|-----|-----|-----|-----|-----|
|output| p1 | p2 | p3 | p4 | p1 | p2 | p3 | p4 |
|------|-----|-----|-----|-----|-----|-----|-----|-----|
********************************************************/
if (is_first_sample && !empty) {
// load input data from dram to nram
__bang_write_value(nram_ping, SAMPLING_NUM * c_slice_align, (T)0);
src_offset =
(batch_idx * height * width + y_low * width + x_low) * channel +
c_offset;
dst_offset = 0;
__memcpy(nram_ping + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
src_offset = (batch_idx * height * width + y_low * width + x_high) *
channel +
c_offset;
dst_offset = c_slice_align;
__memcpy(nram_ping + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
src_offset = (batch_idx * height * width + y_high * width + x_low) *
channel +
c_offset;
dst_offset = c_slice_align * 2;
__memcpy(nram_ping + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
src_offset =
(batch_idx * height * width + y_high * width + x_high) *
channel +
c_offset;
dst_offset = c_slice_align * 3;
__memcpy(nram_ping + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
}
// load next input data to nram
if (sample_i + 1 < bin_dim) {
int p_iy = (sample_i + 1) / roi_bin_grid_w;
int p_ix = (sample_i + 1) % roi_bin_grid_w;
const T p_yy = roi_start_y + ph * bin_size_h +
T(p_iy + 0.5) * bin_size_h / roi_bin_grid_h;
const T p_xx = roi_start_x + pw * bin_size_w +
T(p_ix + 0.5) * bin_size_w / roi_bin_grid_w;
T p_y = p_yy * cos_theta - p_xx * sin_theta + roi_center_y;
T p_x = p_yy * sin_theta + p_xx * cos_theta + roi_center_x;
T p_w1, p_w2, p_w3, p_w4;
bool p_empty = false;
int p_x_low, p_x_high, p_y_low, p_y_high;
bilinearInterpolate(height, width, p_x, p_y, &p_w1, &p_w2, &p_w3,
&p_w4, &p_x_low, &p_x_high, &p_y_low, &p_y_high,
&p_empty);
pongc_slice = c_slice;
pongc_slice_align = c_slice_align;
if (!p_empty) {
__bang_write_value(nram_pong, SAMPLING_NUM * pongc_slice_align,
(T)0);
src_offset =
(batch_idx * height * width + p_y_low * width + p_x_low) *
channel +
c_offset;
dst_offset = 0;
__memcpy(nram_pong + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
src_offset =
(batch_idx * height * width + p_y_low * width + p_x_high) *
channel +
c_offset;
dst_offset = pongc_slice_align;
__memcpy(nram_pong + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
src_offset =
(batch_idx * height * width + p_y_high * width + p_x_low) *
channel +
c_offset;
dst_offset = pongc_slice_align * 2;
__memcpy(nram_pong + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
src_offset =
(batch_idx * height * width + p_y_high * width + p_x_high) *
channel +
c_offset;
dst_offset = pongc_slice_align * 3;
__memcpy(nram_pong + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM);
}
}
T *tmp_sum = nram_ping + 3 * c_slice_align;
if (empty) {
__bang_write_value(tmp_sum, c_slice_align, T(0));
} else {
__bang_mul_scalar(nram_ping, nram_ping, w1, c_slice_align);
__bang_mul_scalar(nram_ping + c_slice_align,
nram_ping + c_slice_align, w2, c_slice_align);
__bang_mul_scalar(nram_ping + 2 * c_slice_align,
nram_ping + 2 * c_slice_align, w3, c_slice_align);
__bang_mul_scalar(nram_ping + 3 * c_slice_align,
nram_ping + 3 * c_slice_align, w4, c_slice_align);
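            // sum the four weighted corner vectors into tmp_sum by pooling
            // over the SAMPLING_NUM axis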
__bang_sumpool(tmp_sum, nram_ping, c_slice_align, 1, SAMPLING_NUM,
1, SAMPLING_NUM, 1, 1);
}
__bang_add(nram_out, nram_out, tmp_sum, c_slice_align);
swap(nram_ping, nram_pong);
__asm__ volatile("sync;");
is_first_sample = false;
}
}
__bang_mul_scalar(nram_out, nram_out, zero_sign, c_slice_align);
// store the result to dram
int output_offset =
((roi_n * params.pooled_height + ph) * params.pooled_width + pw) *
channel +
c_offset;
__memcpy(output_dram + output_offset, nram_out, c_slice * sizeof(T),
NRAM2GDRAM);
}
}
}
template <typename T>
__mlu_func__ void roiAlignRotatedBackward(const T *top_grad_dram,
const T *rois_dram, const int batch,
const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams ¶ms,
T *bottom_grad_dram) {
int align_base_128 = NFU_ALIGN_SIZE / sizeof(T);
int channel_align = CEIL_ALIGN(channel, align_base_128);
unsigned int max_element = MAX_NRAM_SIZE / sizeof(T);
int c_limit = max_element >> 2;
c_limit = c_limit > channel_align ? channel_align : c_limit;
T *nram_ping = (T *)nram_buffer;
T *nram_pong = nram_ping + 2 * c_limit;
T *nram_output = nullptr;
int bin_first = taskId;
int bin_end = rois_num * params.pooled_height * params.pooled_width;
bool is_first_bin = true;
T roi_center_x, roi_center_y, roi_width, roi_height, theta;
int batch_idx, roi_n, pw, ph;
T pong_roi_center_x, pong_roi_center_y, pong_roi_width, pong_roi_height,
pong_theta;
int pong_batch_idx, pong_roi_n, pong_pw, pong_ph;
for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) {
getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph,
&roi_center_x, &roi_center_y, &roi_width, &roi_height,
&theta);
T bin_size_h = roi_height / params.pooled_height;
T bin_size_w = roi_width / params.pooled_width;
int roi_bin_grid_h =
(params.sample_ratio > 0)
? params.sample_ratio
: __float2int_up((float)roi_height / params.pooled_height);
int roi_bin_grid_w =
(params.sample_ratio > 0)
? params.sample_ratio
: __float2int_up((float)roi_width / params.pooled_width);
T roi_start_y = -roi_height / 2;
T roi_start_x = -roi_width / 2;
const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1
? roi_bin_grid_h * roi_bin_grid_w
: 1;
T cos_theta = std::cos(theta);
T sin_theta = std::sin(theta);
T zero_sign = 1.0f / bin_dim;
int c_rem, c_slice, pongc_slice, c_offset;
c_rem = channel;
c_offset = 0;
/****************************************
| ping | pong |
|---------|---------|---------|---------|
| input | output | input | output |
|---------|---------|---------|---------|
*****************************************/
if (is_first_bin) {
// load the first top_grad to nram
c_slice = c_limit < c_rem ? c_limit : c_rem;
int top_grad_offset =
((roi_n * params.pooled_height + ph) * params.pooled_width + pw) *
channel;
__memcpy(nram_ping, top_grad_dram + top_grad_offset, c_slice * sizeof(T),
GDRAM2NRAM);
}
nram_output = nram_ping + c_limit;
while (c_rem > 0) {
c_slice = c_slice < c_rem ? c_slice : c_rem;
// load the next top_grad to nram
if (c_rem - c_slice > 0) {
// load the rest channels to nram
pongc_slice = (c_rem - c_slice > c_slice) ? c_slice : c_rem - c_slice;
int top_grad_offset =
((roi_n * params.pooled_height + ph) * params.pooled_width + pw) *
channel +
c_offset + c_slice;
__memcpy_async(nram_pong, top_grad_dram + top_grad_offset,
pongc_slice * sizeof(T), GDRAM2NRAM);
} else if (bin_i + taskDim < bin_end) {
// load next bin's data to nram
getRoiBinInfo(rois_dram, bin_i + taskDim, params, &pong_batch_idx,
&pong_roi_n, &pong_pw, &pong_ph, &pong_roi_center_x,
&pong_roi_center_y, &pong_roi_width, &pong_roi_height,
&pong_theta);
pongc_slice = c_limit < channel ? c_limit : channel;
int top_grad_offset = ((pong_roi_n * params.pooled_height + pong_ph) *
params.pooled_width +
pong_pw) *
channel;
__memcpy_async(nram_pong, top_grad_dram + top_grad_offset,
c_slice * sizeof(T), GDRAM2NRAM);
}
      // compute the output in a single bin
for (int iy = 0; iy < roi_bin_grid_h; ++iy) {
const T yy = roi_start_y + ph * bin_size_h +
T(iy + 0.5) * bin_size_h / roi_bin_grid_h;
for (int ix = 0; ix < roi_bin_grid_w; ++ix) {
const T xx = roi_start_x + pw * bin_size_w +
T(ix + 0.5) * bin_size_w / roi_bin_grid_w;
T y = yy * cos_theta - xx * sin_theta + roi_center_y;
T x = yy * sin_theta + xx * cos_theta + roi_center_x;
T w1, w2, w3, w4;
bool empty = false;
int x_low, x_high, y_low, y_high;
bilinearInterpolate(height, width, x, y, &w1, &w2, &w3, &w4, &x_low,
&x_high, &y_low, &y_high, &empty);
if (empty) {
continue;
} else {
__bang_mul_scalar(nram_output, nram_ping, w1 * zero_sign, c_limit);
__bang_atomic_add(
(T *)nram_output,
bottom_grad_dram + batch_idx * height * width * channel +
y_low * width * channel + x_low * channel + c_offset,
(T *)nram_output, c_slice);
__bang_mul_scalar(nram_output, nram_ping, w2 * zero_sign, c_limit);
__bang_atomic_add(
(T *)nram_output,
bottom_grad_dram + batch_idx * height * width * channel +
y_low * width * channel + x_high * channel + c_offset,
(T *)nram_output, c_slice);
__bang_mul_scalar(nram_output, nram_ping, w3 * zero_sign, c_limit);
__bang_atomic_add(
(T *)nram_output,
bottom_grad_dram + batch_idx * height * width * channel +
y_high * width * channel + x_low * channel + c_offset,
(T *)nram_output, c_slice);
__bang_mul_scalar(nram_output, nram_ping, w4 * zero_sign, c_limit);
__bang_atomic_add(
(T *)nram_output,
bottom_grad_dram + batch_idx * height * width * channel +
y_high * width * channel + x_high * channel + c_offset,
(T *)nram_output, c_slice);
}
}
}
swap(nram_ping, nram_pong);
c_rem -= c_slice;
c_offset += c_slice;
__asm__ volatile("sync;");
}
is_first_bin = false;
}
}
__mlu_global__ void MLUUnion1KernelRoiAlignRotatedForward(
const void *features, const void *rois, void *output, const int batch,
const int height, const int width, const int channel, const int rois_num,
const RoiAlignRotatedParams rroiAlignParams,
const cnrtDataType_t data_type) {
if (0x80 == coreId) {
return;
}
if (data_type == CNRT_FLOAT32) {
roiAlignRotatedForward((float *)features, (float *)rois, batch, height,
width, channel, rois_num, rroiAlignParams,
(float *)output);
} else {
roiAlignRotatedForward((half *)features, (half *)rois, batch, height, width,
channel, rois_num, rroiAlignParams, (half *)output);
}
}
__mlu_global__ void MLUUnion1KernelRoiAlignRotatedBackward(
const void *top_grad, const void *rois, void *bottom_grad, const int batch,
const int height, const int width, const int channel, const int rois_num,
const RoiAlignRotatedParams rroiAlignParams,
const cnrtDataType_t data_type) {
if (0x80 == coreId) {
return;
}
if (data_type == CNRT_FLOAT32) {
roiAlignRotatedBackward((float *)top_grad, (float *)rois, batch, height,
width, channel, rois_num, rroiAlignParams,
(float *)bottom_grad);
} else {
roiAlignRotatedBackward((half *)top_grad, (half *)rois, batch, height,
width, channel, rois_num, rroiAlignParams,
(half *)bottom_grad);
}
}
void KernelRoiAlignRotatedForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const void *features, const void *rois,
void *output, const int batch, const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams roiAlignRotatedParams) {
MLUUnion1KernelRoiAlignRotatedForward<<<k_dim, k_type, queue>>>(
features, rois, output, batch, height, width, channel, rois_num,
roiAlignRotatedParams, d_type);
}
void KernelRoiAlignRotatedBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const void *top_grad, const void *rois,
void *bottom_grad, const int batch, const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams roiAlignRotatedParams) {
MLUUnion1KernelRoiAlignRotatedBackward<<<k_dim, k_type, queue>>>(
top_grad, rois, bottom_grad, batch, height, width, channel, rois_num,
roiAlignRotatedParams, d_type);
}
mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef ROI_ALIGN_ROTATED_UTILS_HPP_
#define ROI_ALIGN_ROTATED_UTILS_HPP_
struct RoiAlignRotatedParams {
  int pooled_height;
  int pooled_width;
  int sample_ratio;
  float spatial_scale;
  bool aligned;
  bool clockwise;
};
#endif // ROI_ALIGN_ROTATED_UTILS_HPP_
mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#define ALIGN_SIZE 64
#define PIPELINE_COMMON_NUM 2
#define PIPELINE_PINGPONG_NUM 10
__nram__ char nram_buffer[MAX_NRAM_SIZE];
namespace forward {
template <typename T>
__mlu_func__ void getRoiBinInfo(T *input_v, T *rois_v, int bin_i, int height,
int width, int channels, int p_height,
int p_width, T spatial_scale, int *bin_x1,
int *bin_y1, int *bin_x2, int *bin_y2,
int *bin_wdim, int *bin_hdim, int *bin_dims,
T **input_base, bool *is_empty) {
int pw = bin_i % p_width;
int ph = (bin_i / p_width) % p_height;
int roi_n = bin_i / p_width / p_height;
/*roi*/
const T *roi_info = rois_v + roi_n * 5; // {{batch, x1, y1, x2, y2},,,}
int batch_index = (int)roi_info[0];
int roi_x1 = round(roi_info[1] * spatial_scale);
int roi_y1 = round(roi_info[2] * spatial_scale);
int roi_x2 = round(roi_info[3] * spatial_scale);
int roi_y2 = round(roi_info[4] * spatial_scale);
int roi_w = roi_x2 - roi_x1 + 1 > 1 ? roi_x2 - roi_x1 + 1 : 1;
int roi_h = roi_y2 - roi_y1 + 1 > 1 ? roi_y2 - roi_y1 + 1 : 1;
/*bin*/
T bin_w = (T)roi_w / (T)p_width;
T bin_h = (T)roi_h / (T)p_height;
*bin_x1 = (int)floor((T)pw * bin_w) + roi_x1;
*bin_x1 = *bin_x1 > 0 ? *bin_x1 : 0;
*bin_x1 = *bin_x1 < width ? *bin_x1 : width;
*bin_y1 = (int)floor((T)ph * bin_h) + roi_y1;
*bin_y1 = *bin_y1 > 0 ? *bin_y1 : 0;
*bin_y1 = *bin_y1 < height ? *bin_y1 : height;
*bin_x2 = (int)ceil((T)(pw + 1) * bin_w) + roi_x1;
*bin_x2 = *bin_x2 > 0 ? *bin_x2 : 0;
*bin_x2 = *bin_x2 < width ? *bin_x2 : width;
*bin_y2 = (int)ceil((T)(ph + 1) * bin_h) + roi_y1;
*bin_y2 = *bin_y2 > 0 ? *bin_y2 : 0;
*bin_y2 = *bin_y2 < height ? *bin_y2 : height;
*input_base = input_v + batch_index * height * width * channels;
*bin_wdim = *bin_x2 - *bin_x1;
*bin_hdim = *bin_y2 - *bin_y1;
*bin_dims = (*bin_hdim) * (*bin_wdim);
*is_empty = (*bin_y2 <= *bin_y1) || (*bin_x2 <= *bin_x1);
}
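// Illustrative example (assumed values): with spatial_scale = 0.25 and a roi
// of {batch, x1, y1, x2, y2} = {0, 4, 6, 20, 30}, the scaled roi becomes
// x1 = 1, y1 = 2, x2 = 5, y2 = 8, i.e. a 5 x 7 region, which is then split
// into p_height x p_width bins whose floor/ceil boundaries are clamped to the
// feature map.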
template <typename T>
__mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
int channels, int height, int width,
int p_height, int p_width, int rois_num,
T spatial_scale, T *output_v, int *argmax) {
/*
* NRAM partition
* |---------------------------------------------------|
* | ping |
* |---------------------------------------------------|
* | pong |
* |---------------------------------------------------|
* | out |
* |---------------------------------------------------|
* | argmax |
* |---------------------------------------------------|
* | a |
* |---------------------------------------------------|
* | b |
* |---------------------------------------------------|
*/
uint32_t is_half = sizeof(T) == sizeof(half) ? true : false;
uint32_t t_size = sizeof(T);
uint32_t float_div = NFU_ALIGN_SIZE / sizeof(float);
uint32_t half_div = NFU_ALIGN_SIZE / sizeof(half);
uint32_t channels_align = PAD_UP(channels, float_div);
uint32_t nram_limit = PAD_DOWN(
(MAX_NRAM_SIZE / sizeof(float) - 4 * channels_align) / 2, half_div);
  // nram ping/pong, output, argmax, a, b
float *nram_ping = (float *)nram_buffer;
float *nram_pong = (float *)nram_buffer + nram_limit;
float *nram_out = (float *)nram_buffer + 2 * nram_limit;
float *nram_argmax = nram_out + channels_align;
float *nram_a = nram_out + 2 * channels_align;
float *nram_b = nram_out + 3 * channels_align;
uint32_t c_bins_num = rois_num * p_height * p_width;
uint32_t task_bins = c_bins_num / taskDim;
uint32_t rem_bins = c_bins_num % taskDim;
if (taskId < rem_bins) {
task_bins += 1;
}
int bin_first =
(c_bins_num / taskDim) * taskId + (taskId > rem_bins ? rem_bins : taskId);
int bins_loop = bin_first + task_bins;
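  // this core processes bins [bin_first, bins_loop); the first rem_bins cores
  // each take one extra bin so the remainder is spread evenly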
T *input_base = NULL;
T *output_base = output_v + bin_first * channels;
int *argmax_base = NULL != argmax ? argmax + bin_first * channels : NULL;
int bin_x1, bin_y1, bin_x2, bin_y2, bin_wdim, bin_hdim, bin_dims;
int pbin_x1, pbin_y1, pbin_x2, pbin_y2, pbin_wdim, pbin_hdim, pbin_dims;
bool is_empty = false;
bool pong_is_empty = false;
bool is_first_bin = true;
uint32_t src_offset = 0;
uint32_t dst_offset = 0;
uint32_t nram_offset = 0;
uint32_t half_offset =
is_half ? (nram_limit / 2 / half_div * half_div) * 2 : 0;
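  // for half inputs the raw data is staged in the upper half of the ping/pong
  // buffer (at half_offset) and converted to float at the front of the buffer
  // before max pooling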
float *nram_tmp = NULL;
uint32_t c_slice = 0;
uint32_t c_slice_align = 0;
uint32_t pongc_slice = 0;
uint32_t pongc_slice_align = 0;
for (int bin_i = bin_first; bin_i < bins_loop; bin_i++) {
getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i, height, width, channels,
p_height, p_width, (T)spatial_scale, &bin_x1, &bin_y1,
&bin_x2, &bin_y2, &bin_wdim, &bin_hdim, &bin_dims,
&input_base, &is_empty);
uint32_t c_rem = channels;
c_slice = nram_limit / bin_dims / float_div * float_div;
if (is_first_bin && !is_empty) {
c_slice = c_slice > c_rem ? c_rem : c_slice;
c_slice_align = PAD_UP(c_slice, float_div);
for (int h = bin_y1; h < bin_y2; h++) {
src_offset = (h * width + bin_x1) * channels;
nram_offset = (h - bin_y1) * bin_wdim * c_slice_align + half_offset;
if (c_slice_align == channels) {
__memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset,
bin_wdim * c_slice * t_size, GDRAM2NRAM);
} else {
__memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset,
c_slice * t_size, GDRAM2NRAM, c_slice_align * t_size,
channels * t_size, bin_wdim - 1);
}
}
}
uint32_t c_offset = 0;
while (c_rem > 0) {
c_slice = c_slice > c_rem ? c_rem : c_slice;
c_slice_align = PAD_UP(c_slice, float_div);
/*__memcpy_async*/
if (c_rem - c_slice > 0 && !is_empty) {
pongc_slice = c_rem - c_slice > c_slice ? c_slice : c_rem - c_slice;
pongc_slice_align = PAD_UP(pongc_slice, float_div);
for (int h = bin_y1; h < bin_y2; h++) {
src_offset = (h * width + bin_x1) * channels + c_offset;
nram_offset =
(h - bin_y1) * bin_wdim * pongc_slice_align + half_offset;
__memcpy_async((T *)nram_pong + nram_offset,
(T *)input_base + src_offset + c_slice,
pongc_slice * t_size, GDRAM2NRAM,
pongc_slice_align * t_size, channels * t_size,
bin_wdim - 1);
}
} else if (bin_i + 1 < bins_loop) {
getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i + 1, height, width,
channels, p_height, p_width, (T)spatial_scale, &pbin_x1,
&pbin_y1, &pbin_x2, &pbin_y2, &pbin_wdim, &pbin_hdim,
&pbin_dims, &input_base, &pong_is_empty);
pongc_slice = PAD_DOWN(nram_limit / pbin_dims, float_div);
pongc_slice = pongc_slice > channels ? channels : pongc_slice;
pongc_slice_align = PAD_UP(pongc_slice, float_div);
if (!pong_is_empty) {
for (int h = pbin_y1; h < pbin_y2; h++) {
src_offset = (h * width + pbin_x1) * channels;
nram_offset =
(h - pbin_y1) * pbin_wdim * pongc_slice_align + half_offset;
if (pongc_slice_align == channels) {
__memcpy_async((T *)nram_pong + nram_offset,
(T *)input_base + src_offset,
pbin_wdim * pongc_slice * t_size, GDRAM2NRAM);
} else {
__memcpy_async((T *)nram_pong + nram_offset,
(T *)input_base + src_offset, pongc_slice * t_size,
GDRAM2NRAM, pongc_slice_align * t_size,
channels * t_size, pbin_wdim - 1);
}
}
}
}
if (is_empty) {
__bang_write_value((T *)nram_out, c_slice_align, (T)0);
__memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
c_slice * t_size, NRAM2GDRAM);
if (NULL != argmax) {
__bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
__memcpy((int32_t *)argmax_base + dst_offset + c_offset,
(int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM);
}
} else {
if (is_half) {
uint32_t bin_align64 = PAD_UP(bin_dims * c_slice_align, half_div);
__bang_half2float((float *)nram_ping, (half *)nram_ping + half_offset,
bin_align64);
}
__bang_maxpool((float *)nram_out, (float *)nram_ping, c_slice_align,
bin_hdim, bin_wdim, bin_hdim, bin_wdim, 1, 1);
if (is_half) {
uint32_t c_align64 = PAD_UP(c_slice_align, half_div);
__bang_float2half_rd((half *)nram_out, (float *)nram_out, c_align64);
}
__memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
c_slice * t_size, NRAM2GDRAM);
if (NULL != argmax) {
/*compute max_index*/
__bang_maxpool_index((uint32_t *)nram_out, (float *)nram_ping,
c_slice_align, bin_hdim, bin_wdim, bin_hdim,
bin_wdim, 1, 1);
convertInt2Float((float *)nram_argmax, (float *)nram_a,
(int32_t *)nram_out, (float *)nram_b, c_slice_align);
/*compute input_h*/
for (int i = 0; i < c_slice; i++) {
nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim);
}
__bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1,
c_slice_align);
__bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width,
c_slice_align);
/*compute input_w*/
__bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim,
c_slice_align);
__bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a,
c_slice_align);
__bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1,
c_slice_align);
__bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a,
c_slice_align);
convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a,
(float *)nram_out, (float *)nram_b, c_slice_align);
__memcpy((int32_t *)argmax_base + dst_offset + c_offset,
(int32_t *)nram_argmax, c_slice * sizeof(int32_t),
NRAM2GDRAM);
}
}
nram_tmp = nram_ping;
nram_ping = nram_pong;
nram_pong = nram_tmp;
c_offset += c_slice;
c_rem -= c_slice;
__asm__ volatile("sync;");
}
dst_offset += channels;
is_first_bin = false;
}
}
__mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type,
const void *input_data,
const void *input_rois, int batch,
int channels, int height, int width,
int pooled_height, int pooled_width,
int rois_num, float spatial_scale,
void *output_data, int *argmax) {
switch (data_type) {
case CNRT_FLOAT16: {
MLUUnion1Roipool((half *)input_data, (half *)input_rois, batch, channels,
height, width, pooled_height, pooled_width, rois_num,
(half)spatial_scale, (half *)output_data, argmax);
}; break;
case CNRT_FLOAT32: {
MLUUnion1Roipool((float *)input_data, (float *)input_rois, batch,
channels, height, width, pooled_height, pooled_width,
rois_num, (float)spatial_scale, (float *)output_data,
argmax);
}; break;
default: { break; }
}
}
} // namespace forward
namespace backward {
// Convert index of argmax from global grads_image to local bin in RoI. Vector
// operations do not support int type, so conversion from int to float is
// performed here.
__mlu_func__ void convertIndex(
int32_t *nram_argmax, int32_t *nram_argmax_fp, int32_t *nram_argmax_fp_bk1,
int32_t *nram_argmax_fp_bk2, int32_t *nram_argmax_int,
int32_t *nram_argmax_int_h, int32_t *nram_argmax_int_w,
int32_t *nram_argmax_fp_h, int32_t *nram_argmax_fp_w,
float *nram_atomic_add, float *nram_grads_image, int width, int height,
int wstart, int hstart, int w_compute, int h_compute, int align_c,
int channels, int loop_flag, int loop_id, int true_limit) {
convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
(int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
  // This step uses scalar division, because a vector division here would
  // cause rounding accuracy problems.
for (int i = 0; i < channels; ++i) {
*((float *)nram_argmax_fp + i) = *((float *)nram_argmax_fp + i) / width;
}
// Use 'float2int_tz' to perform '*((int32_t*)nram_argmax + i) / width'
// operation.
convertFloat2Int((int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk1,
(float *)nram_argmax_fp, (float *)nram_argmax_fp_bk2,
align_c);
convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
(int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk2,
align_c);
// Perform 'temp_result - hstart' operation
__bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
align_c);
// Perform 'temp_result1 - temp_result2 * width' operation
__bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
align_c);
convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
(int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
__bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp,
(float *)nram_argmax_fp_w, align_c);
// Perform 'temp_result - wstart' operation
__bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w,
wstart, align_c);
// Perform 'temp_result = h * w_compute + w' operation
__bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
w_compute, align_c);
__bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
(float *)nram_argmax_fp_w, align_c);
if (loop_flag == 1) {
__bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
(loop_id * true_limit), align_c);
}
convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1,
(float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2,
align_c);
}
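// Reference-only scalar sketch (hypothetical helper, not part of the original
// kernel): the same global-to-local argmax conversion for a single channel.
static inline int argmaxGlobalToLocalSketch(int argmax_global, int width,
                                            int hstart, int wstart,
                                            int w_compute) {
  int h = argmax_global / width;      // row inside the full feature map
  int w = argmax_global - h * width;  // column inside the full feature map
  // e.g. argmax_global = 37, width = 16, hstart = 1, wstart = 3, w_compute = 4
  // gives h = 2, w = 5 and a local bin index of (2 - 1) * 4 + (5 - 3) = 6.
  return (h - hstart) * w_compute + (w - wstart);
}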
template <typename T>
__mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
const int32_t *argmax, T *grads_image,
int channels, int height, int width,
int pooled_height, int pooled_width,
int rois_num, const T spatial_scale,
int high_precision) {
// Calculate the number of pooled bins processed by each core
int bin_num = rois_num * pooled_height * pooled_width;
int loop =
(bin_num % taskDim) ? (bin_num / taskDim + 1) : (bin_num / taskDim);
int tid = taskId * loop;
if (bin_num % taskDim != 0) {
if (tid >= bin_num) {
return;
} else {
// last part is (bin_num - tid).
loop = bin_num - tid < loop ? bin_num - tid : loop;
}
}
int align_c = PAD_UP(channels, ALIGN_SIZE);
// Common part has 2: grads, argmax; ping-pong each is PIPELINE_PINGPONG_NUM.
int data_size =
PAD_DOWN(((MAX_NRAM_SIZE / sizeof(float) - PIPELINE_COMMON_NUM * align_c -
(PIPELINE_PINGPONG_NUM - 1) * align_c * 2) /
2),
ALIGN_SIZE);
int hw_limit = data_size / align_c;
float *nram_grads = (float *)nram_buffer;
for (int idx = tid; idx < tid + loop; ++idx) {
// (n, ph, pw) is a bin in the pooled output (all channels are handled at once)
int pw = idx % pooled_width;
int ph = (idx / pooled_width) % pooled_height;
int n = idx / pooled_width / pooled_height;
const T *offset_rois = (const T *)(rois + n * 5);
int roi_batch_ind = int(offset_rois[0]);
// Calculate the roi region on feature maps
int roi_start_w = round(offset_rois[1] * spatial_scale);
int roi_start_h = round(offset_rois[2] * spatial_scale);
int roi_end_w = round(offset_rois[3] * spatial_scale);
int roi_end_h = round(offset_rois[4] * spatial_scale);
// Force malformed rois to 1x1
int roi_width =
roi_end_w - roi_start_w + 1 > 1 ? roi_end_w - roi_start_w + 1 : 1;
int roi_height =
roi_end_h - roi_start_h + 1 > 1 ? roi_end_h - roi_start_h + 1 : 1;
T bin_size_h = (T)roi_height / (T)pooled_height;
T bin_size_w = (T)roi_width / (T)pooled_width;
// The corresponding bin region
int hstart = int(floor((T)ph * bin_size_h));
int wstart = int(floor((T)pw * bin_size_w));
int hend = int(ceil((T)(ph + 1) * bin_size_h));
int wend = int(ceil((T)(pw + 1) * bin_size_w));
// Add roi offsets and clip to input boundaries, min(max(A, B), C);
hstart = hstart + roi_start_h > 0 ? hstart + roi_start_h : 0;
hstart = hstart < height ? hstart : height;
hend = hend + roi_start_h > 0 ? hend + roi_start_h : 0;
hend = hend < height ? hend : height;
wstart = wstart + roi_start_w > 0 ? wstart + roi_start_w : 0;
wstart = wstart < width ? wstart : width;
wend = wend + roi_start_w > 0 ? wend + roi_start_w : 0;
wend = wend < width ? wend : width;
bool is_empty = (hend <= hstart) || (wend <= wstart);
if (!is_empty) {
int h_compute = hend - hstart;
int w_compute = wend - wstart;
int true_limit =
hw_limit < h_compute * w_compute ? hw_limit : h_compute * w_compute;
int loop_int = (h_compute * w_compute) / true_limit;
int rem = (h_compute * w_compute) % true_limit;
int32_t *nram_argmax = (int32_t *)nram_grads + align_c;
int32_t *nram_argmax_fp = (int32_t *)nram_argmax + align_c;
int32_t *nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c;
int32_t *nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c;
int32_t *nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c;
int32_t *nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c;
int32_t *nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c;
int32_t *nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c;
int32_t *nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c;
float *nram_atomic_add = (float *)nram_argmax_fp_w + align_c;
float *nram_grads_image = (float *)nram_atomic_add + align_c;
if (true_limit == h_compute * w_compute) {
/*
* NRAM partition
* |---------------------------------------------------|
* | grads |
* |---------------------------------------------------|
* | argmax |
* |---------------------------------------------------|
* | argmax_temp |
* |---------------------------------------------------|
* | atomic_add |
* |---------------------------------------------------|
* | grads_image |
* |---------------------------------------------------|
*/
// Load the data from GDRAM to NRAM.
__memcpy(
(T *)nram_grads + align_c * high_precision,
(const T *)grads +
(n * pooled_height * pooled_width + ph * pooled_width + pw) *
channels,
channels * sizeof(T), GDRAM2NRAM);
if (high_precision) {
__bang_half2float((float *)nram_grads,
(half *)nram_grads + align_c * high_precision,
align_c);
}
__memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
(n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
channels * sizeof(int32_t), GDRAM2NRAM);
// Perform pooling operation on NRAM.
convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,
nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,
nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,
nram_atomic_add, nram_grads_image, width, height, wstart,
hstart, w_compute, h_compute, align_c, channels, 0, 0, 0);
__bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,
(int32_t *)nram_argmax_int, align_c, h_compute,
w_compute, h_compute, w_compute, h_compute,
w_compute);
if (high_precision) {
__bang_float2half_rd((half *)nram_grads_image,
(float *)nram_grads_image,
h_compute * w_compute * align_c);
}
// Store the result on NRAM back to GDRAM.
for (int hc = 0; hc < h_compute; ++hc) {
for (int wc = 0; wc < w_compute; ++wc) {
T *dst = (T *)nram_atomic_add;
int grad_image_offset = (roi_batch_ind * height * width +
(hc + hstart) * width + wc + wstart) *
channels;
T *src1 = (T *)grads_image + grad_image_offset;
int nram_grads_image_offset = (hc * w_compute + wc) * align_c;
T *src2 = (T *)nram_grads_image + nram_grads_image_offset;
__bang_atomic_add(dst, src1, src2, channels);
}
}
} else if (true_limit > 0) {
/*
* NRAM partition
* |---------------------------------------------------|
* | grads |
* |---------------------------------------------------|
* | argmax |
* |--------------------ping_pong----------------------|
* | argmax_temp | argmax_temp |
* |------------------------|--------------------------|
* | atomic_add | atomic_add |
* |------------------------|--------------------------|
* | grads_image | grads_image |
* |---------------------------------------------------|
*/
// Load the data from GDRAM to NRAM.
__memcpy(
(T *)nram_grads + align_c * high_precision,
(const T *)grads +
(n * pooled_height * pooled_width + ph * pooled_width + pw) *
channels,
channels * sizeof(T), GDRAM2NRAM);
if (high_precision) {
__bang_half2float((float *)nram_grads,
(half *)nram_grads + align_c * high_precision,
align_c);
}
__memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
(n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
channels * sizeof(int32_t), GDRAM2NRAM);
int ping_pong = 0;
int ping_pong_offset =
(MAX_NRAM_SIZE / sizeof(float) - align_c * PIPELINE_COMMON_NUM) / 2;
for (int loop_id = 0; loop_id <= loop_int; ++loop_id) {
int size = (loop_id == loop_int) ? rem : true_limit;
if (size == 0) {
break;
}
// Perform pooling operation on NRAM.
nram_argmax_fp =
(int32_t *)nram_argmax + align_c + ping_pong * ping_pong_offset;
nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c;
nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c;
nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c;
nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c;
nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c;
nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c;
nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c;
nram_atomic_add = (float *)nram_argmax_fp_w + align_c;
nram_grads_image = (float *)nram_atomic_add + align_c;
int loop_id_1 = loop_id;
int size_1 = ((loop_id_1) == loop_int) ? rem : true_limit;
if (size_1 == 0) {
break;
}
convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,
nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,
nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,
nram_atomic_add, nram_grads_image, width, height, wstart,
hstart, w_compute, h_compute, align_c, channels, 1,
loop_id_1, true_limit);
__bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,
(int32_t *)nram_argmax_int, align_c, size_1, 1,
size_1, 1, size_1, 1);
if (high_precision) {
__bang_float2half_rd((half *)nram_grads_image,
(float *)nram_grads_image, size_1 * align_c);
}
// Store the result on NRAM back to GDRAM.
for (int index_size = 0; index_size < size; ++index_size) {
int h = (loop_id * true_limit + index_size) / w_compute;
int w = (loop_id * true_limit + index_size) % w_compute;
T *dst = (T *)nram_atomic_add;
T *grads_image_n =
(T *)grads_image + roi_batch_ind * height * width * channels;
T *src1 = (T *)grads_image_n +
((h + hstart) * width + (w + wstart)) * channels;
T *src2 = (T *)nram_grads_image + index_size * align_c;
__bang_atomic_add(dst, src1, src2, channels);
}
ping_pong = 1 - ping_pong;
}
} else {
/*
* NRAM partition
* |---------------------------------------------------|
* | grads |
* |---------------------------------------------------|
* | argmax |
* |--------------------ping_pong----------------------|
* | argmax_temp | argmax_temp |
* |------------------------|--------------------------|
* | atomic_add | atomic_add |
* |------------------------|--------------------------|
* | grads_image | grads_image |
* |---------------------------------------------------|
*/
int c_limit =
PAD_DOWN(MAX_NRAM_SIZE / sizeof(float) /
(PIPELINE_COMMON_NUM + PIPELINE_PINGPONG_NUM * 2),
ALIGN_SIZE);
int loop_int = channels / c_limit;
int rem = channels % c_limit;
int ping_pong = 0;
int ping_pong_offset =
(MAX_NRAM_SIZE / sizeof(float) - c_limit * PIPELINE_COMMON_NUM) / 2;
for (int loop_id = 0; loop_id <= loop_int; ++loop_id) {
int size = (loop_id == loop_int) ? rem : c_limit;
if (size == 0) {
break;
}
nram_argmax_fp =
(int32_t *)nram_argmax + c_limit + ping_pong * ping_pong_offset;
nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + c_limit;
nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + c_limit;
nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + c_limit;
nram_argmax_int_h = (int32_t *)nram_argmax_int + c_limit;
nram_argmax_int_w = (int32_t *)nram_argmax_int_h + c_limit;
nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + c_limit;
nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + c_limit;
nram_atomic_add = (float *)nram_argmax_fp_w + c_limit;
nram_grads_image = (float *)nram_atomic_add + c_limit;
// This pipeline loads the data from GDRAM to NRAM.
__memcpy((T *)nram_grads + c_limit * high_precision,
(const T *)grads +
n * pooled_height * pooled_width * channels +
ph * pooled_width * channels + pw * channels +
loop_id * c_limit,
size * sizeof(T), GDRAM2NRAM);
if (high_precision) {
__bang_half2float((float *)nram_grads,
(half *)nram_grads + c_limit * high_precision,
c_limit);
}
__memcpy((int32_t *)nram_argmax,
(const int32_t *)argmax +
n * pooled_height * pooled_width * channels +
ph * pooled_width * channels + pw * channels +
loop_id * c_limit,
size * sizeof(int32_t), GDRAM2NRAM);
for (int hc = 0; hc < h_compute; ++hc) {
for (int wc = 0; wc < w_compute; ++wc) {
// This pipeline performs pooling operation on NRAM.
convertIndex(
nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,
nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,
nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,
nram_atomic_add, nram_grads_image, width, height, wstart + wc,
hstart + hc, h_compute, w_compute, c_limit, size, 0, 0, 0);
__bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,
(int32_t *)nram_argmax_int, c_limit, 1, 1, 1, 1,
1, 1);
if (high_precision) {
__bang_float2half_rd((half *)nram_grads_image,
(float *)nram_grads_image, c_limit);
}
// This pipeline stores the result on NRAM back to GDRAM.
T *dst = (T *)nram_atomic_add;
T *grads_image_n =
(T *)grads_image + roi_batch_ind * height * width * channels;
T *src1 = (T *)grads_image_n +
((hc + hstart) * width + (wc + wstart)) * channels +
loop_id * c_limit;
T *src2 = (T *)nram_grads_image;
__bang_atomic_add(dst, src1, src2, size);
}
}
ping_pong = 1 - ping_pong;
}
}
}
}
}
__mlu_global__ void MLUKernelRoiPoolBackward(
const void *grads, const void *rois, const int *argmax, void *grads_image,
int rois_num, int pooled_height, int pooled_width, int channels, int no,
int height, int width, const float spatial_scale,
const cnrtDataType_t k_dtype) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (k_dtype) {
case CNRT_FLOAT16: {
// Using the float type '__bang_max_pool_bp' instruction to increase the
// bit width.
const int high_precision = 1;
MLUUnion1Roipool((const half *)rois, (const half *)grads,
(const int32_t *)argmax, (half *)grads_image, channels,
height, width, pooled_height, pooled_width, rois_num,
(const half)spatial_scale, high_precision);
}; break;
case CNRT_FLOAT32: {
const int high_precision = 0;
MLUUnion1Roipool((const float *)rois, (const float *)grads,
(const int32_t *)argmax, (float *)grads_image, channels,
height, width, pooled_height, pooled_width, rois_num,
(const float)spatial_scale, high_precision);
}; break;
default: { break; }
}
}
} // namespace backward
void KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t data_type,
const void *input_data, const void *input_rois,
const int batch, const int channels, const int height,
const int width, const int pooled_height,
const int pooled_width, const int rois_num,
const float spatial_scale, void *output_data,
int *argmax) {
forward::MLUKernelRoiPool<<<k_dim, k_type, queue>>>(
data_type, input_data, input_rois, batch, channels, height, width,
pooled_height, pooled_width, rois_num, spatial_scale, output_data,
argmax);
}
void KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t k_dtype,
const void *grad_output_ptr, const void *rois_ptr,
const int *argmax_ptr, void *grad_input_ptr,
const int box_num, const int pooled_height,
const int pooled_width, const int channels,
const int batch, const int height, const int width,
const float spatial_scale) {
backward::MLUKernelRoiPoolBackward<<<k_dim, k_type, queue>>>(
grad_output_ptr, rois_ptr, argmax_ptr, grad_input_ptr, box_num,
pooled_height, pooled_width, channels, batch, height, width,
spatial_scale, k_dtype);
}
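// A minimal host-side sketch (illustrative only; names here are hypothetical
// and it is not called by the kernels above) of the bin-boundary math shared
// by the forward and backward paths: pooled cell (ph, pw) of an RoI maps to a
// clipped [hstart, hend) x [wstart, wend) window of the input feature map.
static inline void roiPoolBinSketch(const float *roi /* (batch, x1, y1, x2, y2) */,
                                    float spatial_scale, int pooled_height,
                                    int pooled_width, int height, int width,
                                    int ph, int pw, int *hstart, int *hend,
                                    int *wstart, int *wend) {
  int roi_start_w = (int)round(roi[1] * spatial_scale);
  int roi_start_h = (int)round(roi[2] * spatial_scale);
  int roi_end_w = (int)round(roi[3] * spatial_scale);
  int roi_end_h = (int)round(roi[4] * spatial_scale);
  // Force malformed RoIs to at least 1x1.
  int roi_height = roi_end_h - roi_start_h + 1 > 1 ? roi_end_h - roi_start_h + 1 : 1;
  int roi_width = roi_end_w - roi_start_w + 1 > 1 ? roi_end_w - roi_start_w + 1 : 1;
  float bin_size_h = (float)roi_height / pooled_height;
  float bin_size_w = (float)roi_width / pooled_width;
  *hstart = (int)floor(ph * bin_size_h) + roi_start_h;
  *hend = (int)ceil((ph + 1) * bin_size_h) + roi_start_h;
  *wstart = (int)floor(pw * bin_size_w) + roi_start_w;
  *wend = (int)ceil((pw + 1) * bin_size_w) + roi_start_w;
  // Clip to the feature map; the bin is empty if hend <= hstart or wend <= wstart.
  *hstart = *hstart < 0 ? 0 : (*hstart > height ? height : *hstart);
  *hend = *hend < 0 ? 0 : (*hend > height ? height : *hend);
  *wstart = *wstart < 0 ? 0 : (*wstart > width ? width : *wstart);
  *wend = *wend < 0 ? 0 : (*wend > width ? width : *wend);
}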
mmcv/ops/csrc/common/mlu/roiaware_pool3d_mlu_kernel.mlu deleted 100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#define ROI_OFFSET 7
#define FLOAT_NRAM_BUFFER_NUM 14
#define HALF_NRAM_BUFFER_NUM 25
#define ALIGN_NUM 64
__nram__ char data_nram[MAX_NRAM_SIZE];
template <typename T>
__mlu_global__ void MLUUnion1KernelPtsIdxOfVoxels(
const int pool_method, const int boxes_num, const int pts_num,
const int max_pts_each_voxel, const int out_x, const int out_y,
const int out_z, const T *rois, const T *pts, int *pts_idx_of_voxels) {
// params (T)rois: (boxes_num, 7)
// params (T)pts: (3, pts_num)
// params (int)pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z,
// max_pts_each_voxel)
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
int nram_pts_num = 0;
if (sizeof(T) == sizeof(float)) {
nram_pts_num = PAD_DOWN(
(MAX_NRAM_SIZE / sizeof(float) / FLOAT_NRAM_BUFFER_NUM), ALIGN_NUM);
} else {
nram_pts_num = PAD_DOWN(
(MAX_NRAM_SIZE / sizeof(half) / HALF_NRAM_BUFFER_NUM), ALIGN_NUM);
}
char *X = NULL;
char *Y = NULL;
char *Z = NULL;
char *local_X = NULL;
char *local_Y = NULL;
char *local_Z = NULL;
char *nram_pts_in_flag = NULL;
float *temp_buffer1 = NULL;
float *temp_buffer2 = NULL;
float *temp_buffer3 = NULL;
float *temp_buffer4 = NULL;
float *temp_buffer5 = NULL;
float *nram_voxel_offset = NULL;
int *nram_pts_idx_seq = NULL;
float *fp_local_X = NULL;
float *fp_local_Y = NULL;
float *fp_local_Z = NULL;
float *fp_nram_pts_in_flag = NULL;
if (sizeof(T) == sizeof(float)) {
X = (char *)((float *)data_nram);
Y = (char *)((float *)data_nram + nram_pts_num);
Z = (char *)((float *)data_nram + nram_pts_num * 2);
local_X = (char *)((float *)data_nram + nram_pts_num * 3);
local_Y = (char *)((float *)data_nram + nram_pts_num * 4);
local_Z = (char *)((float *)data_nram + nram_pts_num * 5);
nram_pts_in_flag = (char *)((float *)data_nram + nram_pts_num * 6);
temp_buffer1 = (float *)data_nram + nram_pts_num * 7;
temp_buffer2 = (float *)data_nram + nram_pts_num * 8;
temp_buffer3 = (float *)data_nram + nram_pts_num * 9;
temp_buffer4 = (float *)data_nram + nram_pts_num * 10;
temp_buffer5 = (float *)data_nram + nram_pts_num * 11;
nram_voxel_offset = (float *)data_nram + nram_pts_num * 12;
nram_pts_idx_seq = (int *)((float *)data_nram + nram_pts_num * 13);
fp_local_X = (float *)local_X;
fp_local_Y = (float *)local_Y;
fp_local_Z = (float *)local_Z;
fp_nram_pts_in_flag = (float *)nram_pts_in_flag;
} else {
X = (char *)((half *)data_nram);
Y = (char *)((half *)data_nram + nram_pts_num);
Z = (char *)((half *)data_nram + nram_pts_num * 2);
local_X = (char *)((half *)data_nram + nram_pts_num * 4);
local_Y = (char *)((half *)data_nram + nram_pts_num * 6);
local_Z = (char *)((half *)data_nram + nram_pts_num * 8);
nram_pts_in_flag = (char *)((half *)data_nram + nram_pts_num * 10);
temp_buffer1 = (float *)((half *)data_nram + nram_pts_num * 11);
temp_buffer2 = (float *)((half *)data_nram + nram_pts_num * 13);
temp_buffer3 = (float *)((half *)data_nram + nram_pts_num * 15);
temp_buffer4 = (float *)((half *)data_nram + nram_pts_num * 17);
temp_buffer5 = (float *)((half *)data_nram + nram_pts_num * 19);
nram_voxel_offset = (float *)((half *)data_nram + nram_pts_num * 21);
nram_pts_idx_seq = (int *)((half *)data_nram + nram_pts_num * 23);
fp_local_X = (float *)((half *)local_X - nram_pts_num);
fp_local_Y = (float *)((half *)local_Y - nram_pts_num);
fp_local_Z = (float *)((half *)local_Z - nram_pts_num);
fp_nram_pts_in_flag = (float *)((half *)nram_pts_in_flag - nram_pts_num);
}
for (int i = 0; i < nram_pts_num; i++) {
nram_pts_idx_seq[i] = i;
}
int nram_pts_loop_times = pts_num / nram_pts_num;
int rem_nram_num = pts_num % nram_pts_num;
for (int roi_index = taskId; roi_index < boxes_num; roi_index += taskDim) {
const T *cur_roi = rois + roi_index * ROI_OFFSET;
T cx = cur_roi[0];
T cy = cur_roi[1];
T cz = cur_roi[2];
T dx = cur_roi[3];
T dy = cur_roi[4];
T dz = cur_roi[5];
T rz = cur_roi[6];
T dx_2 = dx / 2.0;
T dy_2 = dy / 2.0;
T dz_2 = dz / 2.0;
for (int loop_idx = 0; loop_idx <= nram_pts_loop_times; loop_idx++) {
int load_pts_num =
(loop_idx == nram_pts_loop_times) ? rem_nram_num : nram_pts_num;
if (load_pts_num == 0) {
break;
}
int pts_offset_cur_loop = nram_pts_num * loop_idx;
int compute_pts_num = (loop_idx == nram_pts_loop_times)
? PAD_UP(rem_nram_num, ALIGN_NUM)
: nram_pts_num;
// load pts
__memcpy((void *)X, (T *)pts + pts_offset_cur_loop,
load_pts_num * sizeof(T), GDRAM2NRAM);
__memcpy((void *)Y, (T *)pts + pts_num + pts_offset_cur_loop,
load_pts_num * sizeof(T), GDRAM2NRAM);
__memcpy((void *)Z, (T *)pts + pts_num * 2 + pts_offset_cur_loop,
load_pts_num * sizeof(T), GDRAM2NRAM);
// fabs(local_z)
__bang_sub_scalar((T *)local_Z, (T *)Z, (T)cz, compute_pts_num);
__bang_sub_scalar((T *)temp_buffer1, (T *)Z, (T)(cz + dz_2),
compute_pts_num);
__bang_active_abs((T *)temp_buffer1, (T *)temp_buffer1, compute_pts_num);
#if __BANG_ARCH__ >= 322
__bang_le_scalar((T *)nram_pts_in_flag, (T *)temp_buffer1, (T)(dz_2),
compute_pts_num);
#else
__bang_write_value((void *)temp_buffer2, compute_pts_num, (T)(dz_2));
__bang_le((T *)nram_pts_in_flag, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
#endif
T cosa = std::cos(-rz);
T sina = std::sin(-rz);
__bang_sub_scalar((T *)temp_buffer3, (T *)X, (T)cx, compute_pts_num);
__bang_sub_scalar((T *)temp_buffer4, (T *)Y, (T)cy, compute_pts_num);
__bang_mul_scalar((T *)temp_buffer1, (T *)temp_buffer3, (T)cosa,
compute_pts_num);
__bang_mul_scalar((T *)temp_buffer2, (T *)temp_buffer4, (T)sina,
compute_pts_num);
// local_x
__bang_sub((T *)local_X, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
// fabs(local_x)
__bang_active_abs((T *)temp_buffer1, (T *)local_X, compute_pts_num);
// fabs(local_x) < dx/2 ? 1 : 0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar((T *)temp_buffer1, (T *)temp_buffer1, (T)(dx_2),
compute_pts_num);
#else
__bang_write_value((void *)temp_buffer2, compute_pts_num, (T)(dx_2));
__bang_lt((T *)temp_buffer1, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
#endif
__bang_and((T *)nram_pts_in_flag, (T *)nram_pts_in_flag,
(T *)temp_buffer1,
compute_pts_num); // flush res
__bang_mul_scalar((T *)temp_buffer1, (T *)temp_buffer3, (T)sina,
compute_pts_num);
__bang_mul_scalar((T *)temp_buffer2, (T *)temp_buffer4, (T)cosa,
compute_pts_num);
// local_y
__bang_add((T *)local_Y, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
// fabs(local_y)
__bang_active_abs((T *)temp_buffer1, (T *)local_Y, compute_pts_num);
// fabs(local_y) < dy/2 ? 1 : 0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar((T *)temp_buffer1, (T *)temp_buffer1, (T)(dy_2),
compute_pts_num);
#else
__bang_write_value((void *)temp_buffer2, compute_pts_num, (T)(dy_2));
__bang_lt((T *)temp_buffer1, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
#endif
__bang_and((T *)nram_pts_in_flag, (T *)nram_pts_in_flag,
(T *)temp_buffer1,
compute_pts_num); // flush res
T x_res = dx / out_x;
T y_res = dy / out_y;
T z_res = dz / out_z;
__bang_add_scalar((T *)local_X, (T *)local_X, (T)(dx_2), compute_pts_num);
__bang_add_scalar((T *)local_Y, (T *)local_Y, (T)(dy_2), compute_pts_num);
// local_Z does not need the dz/2.0 offset
#if (__BANG_ARCH__ >= 322) && (__BANG_ARCH__ != 372)
__bang_div((T *)local_X, (T *)local_X, (T)x_res, compute_pts_num);
__bang_div((T *)local_Y, (T *)local_Y, (T)y_res, compute_pts_num);
__bang_div((T *)local_Z, (T *)local_Z, (T)z_res, compute_pts_num);
#else
__bang_mul_scalar((T *)local_X, (T *)local_X, (T)(1 / x_res),
compute_pts_num);
__bang_mul_scalar((T *)local_Y, (T *)local_Y, (T)(1 / y_res),
compute_pts_num);
__bang_mul_scalar((T *)local_Z, (T *)local_Z, (T)(1 / z_res),
compute_pts_num);
#endif
// float = float2int + int2float, half = half2int + int2float
if (sizeof(T) == sizeof(float)) {
#if __BANG_ARCH__ >= 322
__bang_float2int32_tz((int *)temp_buffer1, (float *)local_X,
compute_pts_num, 0);
__bang_float2int32_tz((int *)temp_buffer2, (float *)local_Y,
compute_pts_num, 0);
__bang_float2int32_tz((int *)temp_buffer3, (float *)local_Z,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_X, (int *)temp_buffer1,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_Y, (int *)temp_buffer2,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_Z, (int *)temp_buffer3,
compute_pts_num, 0);
#else
convertFloat2Int((int *)temp_buffer1, (float *)temp_buffer2,
(float *)fp_local_X, (float *)temp_buffer3,
compute_pts_num);
convertFloat2Int((int *)temp_buffer2, (float *)temp_buffer3,
(float *)fp_local_Y, (float *)temp_buffer4,
compute_pts_num);
convertFloat2Int((int *)temp_buffer3, (float *)temp_buffer4,
(float *)fp_local_Z, (float *)temp_buffer5,
compute_pts_num);
convertInt2Float((float *)fp_local_X, (float *)temp_buffer4,
(int *)temp_buffer1, (float *)temp_buffer5,
compute_pts_num);
convertInt2Float((float *)fp_local_Y, (float *)temp_buffer4,
(int *)temp_buffer2, (float *)temp_buffer5,
compute_pts_num);
convertInt2Float((float *)fp_local_Z, (float *)temp_buffer4,
(int *)temp_buffer3, (float *)temp_buffer5,
compute_pts_num);
#endif
} else {
__bang_half2float((float *)temp_buffer4, (half *)nram_pts_in_flag,
compute_pts_num);
__bang_move((void *)fp_nram_pts_in_flag, (void *)temp_buffer4,
compute_pts_num * sizeof(float));
#if __BANG_ARCH__ >= 322
__bang_half2int32_tz((int *)temp_buffer1, (half *)local_X,
compute_pts_num, 0);
__bang_half2int32_tz((int *)temp_buffer2, (half *)local_Y,
compute_pts_num, 0);
__bang_half2int32_tz((int *)temp_buffer3, (half *)local_Z,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_X, (int *)temp_buffer1,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_Y, (int *)temp_buffer2,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_Z, (int *)temp_buffer3,
compute_pts_num, 0);
#else
__bang_half2int16_tz((int16_t *)temp_buffer1, (half *)local_X,
compute_pts_num, 0);
__bang_half2int16_tz((int16_t *)temp_buffer2, (half *)local_Y,
compute_pts_num, 0);
__bang_half2int16_tz((int16_t *)temp_buffer3, (half *)local_Z,
compute_pts_num, 0);
__bang_int162float((float *)fp_local_X, (int16_t *)temp_buffer1,
compute_pts_num, 0);
__bang_int162float((float *)fp_local_Y, (int16_t *)temp_buffer2,
compute_pts_num, 0);
__bang_int162float((float *)fp_local_Z, (int16_t *)temp_buffer3,
compute_pts_num, 0);
#endif
}
// process index >= 0
__bang_write_value((float *)temp_buffer4, compute_pts_num, (float)0.0f);
__bang_maxequal((float *)fp_local_X, (float *)fp_local_X,
(float *)temp_buffer4, compute_pts_num);
__bang_maxequal((float *)fp_local_Y, (float *)fp_local_Y,
(float *)temp_buffer4, compute_pts_num);
__bang_maxequal((float *)fp_local_Z, (float *)fp_local_Z,
(float *)temp_buffer4, compute_pts_num);
// process index <= (out_x - 1)
__bang_write_value((float *)temp_buffer5, compute_pts_num,
(float)(out_x - 1));
__bang_minequal((float *)fp_local_X, (float *)fp_local_X,
(float *)temp_buffer5, compute_pts_num);
__bang_write_value((float *)temp_buffer5, compute_pts_num,
(float)(out_y - 1));
__bang_minequal((float *)fp_local_Y, (float *)fp_local_Y,
(float *)temp_buffer5, compute_pts_num);
__bang_write_value((float *)temp_buffer5, compute_pts_num,
(float)(out_z - 1));
__bang_minequal((float *)fp_local_Z, (float *)fp_local_Z,
(float *)temp_buffer5, compute_pts_num);
__bang_mul_scalar((float *)temp_buffer1, (float *)fp_local_X,
(float)(out_y * out_z), compute_pts_num);
__bang_mul_scalar((float *)temp_buffer2, (float *)fp_local_Y,
(float)out_z, compute_pts_num);
__bang_mul_scalar((float *)temp_buffer3, (float *)fp_local_Z, (float)1.0,
compute_pts_num);
__bang_add((float *)nram_voxel_offset, (float *)temp_buffer1,
(float *)temp_buffer2, compute_pts_num);
__bang_add((float *)nram_voxel_offset, (float *)nram_voxel_offset,
(float *)temp_buffer3, compute_pts_num);
__bang_mul_scalar((float *)nram_voxel_offset, (float *)nram_voxel_offset,
(float)max_pts_each_voxel, compute_pts_num);
if (compute_pts_num != load_pts_num) {
__memset_nram((float *)fp_nram_pts_in_flag + load_pts_num,
compute_pts_num - load_pts_num, (float)0.0);
}
__bang_collect((float *)temp_buffer4, (float *)nram_pts_idx_seq,
(float *)fp_nram_pts_in_flag, compute_pts_num);
int pts_num_in_cur_roi =
(int)__bang_count((float *)fp_nram_pts_in_flag, compute_pts_num);
int *pts_idx_cur_voxels =
(int *)pts_idx_of_voxels +
roi_index * out_x * out_y * out_z * max_pts_each_voxel;
for (int idx = 0; idx < pts_num_in_cur_roi; idx++) {
int cur_pts_idx = *((int *)temp_buffer4 + idx);
int offset = (int)(*((float *)nram_voxel_offset + cur_pts_idx));
int cnt = pts_idx_cur_voxels[offset];
if (cnt < max_pts_each_voxel - 1) {
pts_idx_cur_voxels[offset + cnt + 1] =
cur_pts_idx + loop_idx * nram_pts_num;
pts_idx_cur_voxels[offset]++;
}
}
}
}
}
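// Reference-only scalar sketch (hypothetical helper, not part of the original
// kernel; assumes <cmath> is available, as the kernel already uses std::cos
// and std::sin): the per-point voxel-offset math that the vectorized code
// above performs, for one LiDAR point and one RoI (cx, cy, cz, dx, dy, dz, rz)
// with cz at the bottom face. Returns -1 if the point lies outside the box.
static inline int ptsVoxelOffsetSketch(float px, float py, float pz,
                                       const float *roi, int out_x, int out_y,
                                       int out_z, int max_pts_each_voxel) {
  float cosa = std::cos(-roi[6]), sina = std::sin(-roi[6]);
  float lx = (px - roi[0]) * cosa - (py - roi[1]) * sina;  // box-frame x
  float ly = (px - roi[0]) * sina + (py - roi[1]) * cosa;  // box-frame y
  float lz = pz - roi[2];                                  // box-frame z
  if (!(std::fabs(lx) < 0.5f * roi[3]) || !(std::fabs(ly) < 0.5f * roi[4]) ||
      std::fabs(lz - 0.5f * roi[5]) > 0.5f * roi[5])
    return -1;
  int xi = (int)((lx + 0.5f * roi[3]) / (roi[3] / out_x));
  int yi = (int)((ly + 0.5f * roi[4]) / (roi[4] / out_y));
  int zi = (int)(lz / (roi[5] / out_z));
  xi = xi < 0 ? 0 : (xi > out_x - 1 ? out_x - 1 : xi);  // clamp to the grid
  yi = yi < 0 ? 0 : (yi > out_y - 1 ? out_y - 1 : yi);
  zi = zi < 0 ? 0 : (zi > out_z - 1 ? out_z - 1 : zi);
  return (xi * out_y * out_z + yi * out_z + zi) * max_pts_each_voxel;
}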
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiawarePool3dForward(
const int pool_method, const int boxes_num, const int pts_num,
const int channels, const int max_pts_each_voxel, const int out_x,
const int out_y, const int out_z, const T *pts_feature,
const int *pts_idx_of_voxels, T *pooled_features, int *argmax) {
// params (T)pts_feature: (channels, pts_num)
// params (int)pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z,
//                                 max_pts_each_voxel)
// params (int)argmax: (boxes_num, out_x, out_y, out_z, channels)
// params (T)pooled_features: (boxes_num, out_x, out_y, out_z, channels)
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
int align_num = NFU_ALIGN_SIZE / sizeof(T);
int align_max_pts_each_voxel = PAD_UP(max_pts_each_voxel, align_num);
int nram_channels_limit =
PAD_DOWN((MAX_NRAM_SIZE - 128 -
align_max_pts_each_voxel * (sizeof(int) + sizeof(T))) /
((align_max_pts_each_voxel + 1) * sizeof(T) + sizeof(int)),
align_num);
int *nram_pts_idx_cur_voxel = (int *)data_nram;
// nram_pts_idx_cur_voxel [align_max_pts_each_voxel]
T *nram_max_pts_feature_tmp =
(T *)((int *)nram_pts_idx_cur_voxel + align_max_pts_each_voxel);
// nram_max_pts_feature_tmp [align_max_pts_each_voxel]
T *nram_pts_feature_in_voxel =
((T *)nram_max_pts_feature_tmp + align_max_pts_each_voxel);
// nram_pts_feature_in_voxel [nram_channels_limit, align_max_pts_each_voxel]
T *nram_pooled_features_cur_voxel =
((T *)nram_pts_feature_in_voxel +
nram_channels_limit * align_max_pts_each_voxel);
// nram_pooled_features_cur_voxel [nram_channels_limit]
int *nram_argmax_cur_voxel =
(int *)((T *)nram_pooled_features_cur_voxel + nram_channels_limit);
// nram_argmax_cur_voxel [nram_channels_limit]
char *one_pooled_feature =
(char *)((int *)nram_argmax_cur_voxel + nram_channels_limit);
// one_pooled_feature [128]
int channels_loop_times = channels / nram_channels_limit;
int rem_channels = channels % nram_channels_limit;
for (int voxel_index = taskId;
voxel_index < boxes_num * out_x * out_y * out_z;
voxel_index += taskDim) {
int *pts_idx_cur_voxels =
(int *)pts_idx_of_voxels + voxel_index * max_pts_each_voxel;
__memcpy((void *)nram_pts_idx_cur_voxel, (void *)pts_idx_cur_voxels,
max_pts_each_voxel * sizeof(int), GDRAM2NRAM);
int pts_num_cur_voxel = nram_pts_idx_cur_voxel[0];
if (pts_num_cur_voxel == 0) {
continue;
}
for (int channels_loop_idx = 0; channels_loop_idx <= channels_loop_times;
channels_loop_idx++) {
int actual_channels_num = (channels_loop_idx == channels_loop_times)
? rem_channels
: nram_channels_limit;
if (actual_channels_num == 0) {
break;
}
int channels_offset = nram_channels_limit * channels_loop_idx;
#if ((__BANG_ARCH__ >= 200) && (__BANG_ARCH__ < 300))
int compute_channels_num = (channels_loop_idx == channels_loop_times)
? PAD_UP(rem_channels, align_num)
: nram_channels_limit;
if (pool_method == 0) {
__bang_write_value((void *)nram_pts_feature_in_voxel,
compute_channels_num * align_max_pts_each_voxel,
(T)-INFINITY);
}
#endif
T *pts_feature_cur_loop = (T *)pts_feature + channels_offset * pts_num;
for (int idx = 0; idx < pts_num_cur_voxel; idx++) {
__memcpy((T *)nram_pts_feature_in_voxel + idx,
(T *)pts_feature_cur_loop + nram_pts_idx_cur_voxel[idx + 1],
sizeof(T), GDRAM2NRAM, align_max_pts_each_voxel * sizeof(T),
pts_num * sizeof(T), actual_channels_num - 1);
}
for (int channel_idx = 0; channel_idx < actual_channels_num;
channel_idx++) {
if (pool_method == 0) {
#if __BANG_ARCH__ >= 322
__bang_argmax((T *)one_pooled_feature,
(T *)nram_pts_feature_in_voxel +
channel_idx * align_max_pts_each_voxel,
pts_num_cur_voxel);
T max_val = ((T *)one_pooled_feature)[0];
int max_idx = (int)(*(uint32_t *)((T *)one_pooled_feature + 1));
nram_pooled_features_cur_voxel[channel_idx] =
(max_val == -INFINITY) ? 0 : max_val;
nram_argmax_cur_voxel[channel_idx] =
(max_val == -INFINITY) ? -1 : nram_pts_idx_cur_voxel[max_idx + 1];
#else
// __bang_max needs an aligned length on the MLU200 series
if (sizeof(T) == sizeof(float)) {
__bang_max((float *)one_pooled_feature,
(float *)nram_pts_feature_in_voxel +
channel_idx * align_max_pts_each_voxel,
align_max_pts_each_voxel);
float max_val = ((float *)one_pooled_feature)[0];
__bang_write_value((void *)nram_max_pts_feature_tmp,
align_max_pts_each_voxel, (float)max_val);
__bang_eq((float *)nram_max_pts_feature_tmp,
(float *)nram_pts_feature_in_voxel +
channel_idx * align_max_pts_each_voxel,
(float *)nram_max_pts_feature_tmp,
align_max_pts_each_voxel);
int max_idx = (int)__bang_findfirst1(
(float *)nram_max_pts_feature_tmp, align_max_pts_each_voxel);
nram_pooled_features_cur_voxel[channel_idx] =
(max_val == -INFINITY) ? 0 : max_val;
nram_argmax_cur_voxel[channel_idx] =
(max_val == -INFINITY) ? -1
: nram_pts_idx_cur_voxel[max_idx + 1];
} else {
int max_idx = -1;
float max_val = -INFINITY;
for (int k = 0; k < pts_num_cur_voxel; k++) {
float pts_feature_cur_channel = __half2float_rd(
*((half *)nram_pts_feature_in_voxel +
channel_idx * align_max_pts_each_voxel + k));
if (pts_feature_cur_channel > max_val) {
max_val = pts_feature_cur_channel;
max_idx = k;
}
}
nram_pooled_features_cur_voxel[channel_idx] =
(max_idx == -1) ? 0 : max_val;
nram_argmax_cur_voxel[channel_idx] =
(max_idx == -1) ? -1 : nram_pts_idx_cur_voxel[max_idx + 1];
}
#endif
} else if (pool_method == 1) {
float sum_val_cur_channel = 0;
for (int k = 0; k < pts_num_cur_voxel; k++) {
sum_val_cur_channel += static_cast<float>(
((T *)nram_pts_feature_in_voxel)[channel_idx *
align_max_pts_each_voxel +
k]);
}
nram_pooled_features_cur_voxel[channel_idx] =
(T)(sum_val_cur_channel / pts_num_cur_voxel);
}
}
// store
__memcpy((T *)pooled_features + voxel_index * channels + channels_offset,
(void *)nram_pooled_features_cur_voxel,
actual_channels_num * sizeof(T), NRAM2GDRAM);
if (pool_method == 0) {
__memcpy((int *)argmax + voxel_index * channels + channels_offset,
(void *)nram_argmax_cur_voxel,
actual_channels_num * sizeof(int), NRAM2GDRAM);
}
}
}
}
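// Reference-only scalar sketch (hypothetical helper, not part of the original
// kernel; the half-precision and NRAM tiling details above are ignored):
// pooling over one voxel's collected points for a single channel.
// pts_feature_col is that channel's column of the (channels, pts_num) feature
// matrix, and pts_idx[0] holds the number of valid point indices following it.
static inline float poolOneChannelSketch(const float *pts_feature_col,
                                         const int *pts_idx, int pool_method) {
  int n = pts_idx[0];
  if (n == 0) return 0.f;
  if (pool_method == 0) {  // max pool
    float best = pts_feature_col[pts_idx[1]];
    for (int k = 2; k <= n; ++k)
      best = pts_feature_col[pts_idx[k]] > best ? pts_feature_col[pts_idx[k]] : best;
    return best;
  }
  float sum = 0.f;  // avg pool
  for (int k = 1; k <= n; ++k) sum += pts_feature_col[pts_idx[k]];
  return sum / n;
}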
void KernelPtsIdxOfVoxels(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const int pool_method, const int boxes_num,
const int pts_num, const int max_pts_each_voxel,
const int out_x, const int out_y, const int out_z,
const void *rois, const void *pts,
int *pts_idx_of_voxels) {
switch (d_type) {
case CNRT_FLOAT32: {
MLUUnion1KernelPtsIdxOfVoxels<float><<<k_dim, k_type, queue>>>(
pool_method, boxes_num, pts_num, max_pts_each_voxel, out_x, out_y,
out_z, (float *)rois, (float *)pts, (int *)pts_idx_of_voxels);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelPtsIdxOfVoxels<half><<<k_dim, k_type, queue>>>(
pool_method, boxes_num, pts_num, max_pts_each_voxel, out_x, out_y,
out_z, (half *)rois, (half *)pts, (int *)pts_idx_of_voxels);
}; break;
default: {
break;
}
}
}
void KernelRoiawarePool3dForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const int pool_method, const int boxes_num,
const int pts_num, const int channels, const int max_pts_each_voxel,
const int out_x, const int out_y, const int out_z, const void *pts_feature,
const int *pts_idx_of_voxels, void *pooled_features, int *argmax) {
switch (d_type) {
case CNRT_FLOAT32: {
MLUUnion1KernelRoiawarePool3dForward<float><<<k_dim, k_type, queue>>>(
pool_method, boxes_num, pts_num, channels, max_pts_each_voxel, out_x,
out_y, out_z, (float *)pts_feature, (int *)pts_idx_of_voxels,
(float *)pooled_features, (int *)argmax);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelRoiawarePool3dForward<half><<<k_dim, k_type, queue>>>(
pool_method, boxes_num, pts_num, channels, max_pts_each_voxel, out_x,
out_y, out_z, (half *)pts_feature, (int *)pts_idx_of_voxels,
(half *)pooled_features, (int *)argmax);
}; break;
default: {
break;
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiawareMaxPool3dBackward(
const int boxes_num, const int out_x, const int out_y, const int out_z,
const int channels, const int *argmax, const T *grad_out, T *grad_in) {
// params (int)argmax: (boxes_num, out_x, out_y, out_z, channels)
// params (T)grad_out: (boxes_num, out_x, out_y, out_z, channels)
// params (T)grad_in: (pts_num, channels)
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
int nram_channels_limit =
(MAX_NRAM_SIZE - sizeof(T) * 1) / (sizeof(T) + sizeof(int));
int *nram_argmax_cur_loop = (int *)data_nram;
// nram_argmax_cur_loop [nram_channels_limit]
T *nram_grad_out_cur_loop =
(T *)((int *)nram_argmax_cur_loop + nram_channels_limit);
// nram_grad_out_cur_loop [nram_channels_limit]
T *nram_grad_in_cur_channel =
(T *)nram_grad_out_cur_loop + nram_channels_limit;
// nram_grad_in_cur_channel [1]
int channels_loop_times = channels / nram_channels_limit;
int rem_channels = channels % nram_channels_limit;
int voxels_num = boxes_num * out_x * out_y * out_z;
for (int voxel_index = taskId; voxel_index < voxels_num;
voxel_index += taskDim) {
const int *argmax_cur_voxel = argmax + voxel_index * channels;
const T *grad_out_cur_voxel = grad_out + voxel_index * channels;
for (int channels_loop_idx = 0; channels_loop_idx <= channels_loop_times;
channels_loop_idx++) {
int actual_channels_num = (channels_loop_idx == channels_loop_times)
? rem_channels
: nram_channels_limit;
if (actual_channels_num == 0) {
break;
}
const int *argmax_cur_loop =
argmax_cur_voxel + nram_channels_limit * channels_loop_idx;
const T *grad_out_cur_loop =
grad_out_cur_voxel + nram_channels_limit * channels_loop_idx;
__memcpy((void *)nram_argmax_cur_loop, (void *)argmax_cur_loop,
actual_channels_num * sizeof(int), GDRAM2NRAM);
__memcpy((void *)nram_grad_out_cur_loop, (void *)grad_out_cur_loop,
actual_channels_num * sizeof(T), GDRAM2NRAM);
for (int channel_idx = 0; channel_idx < actual_channels_num;
channel_idx++) {
int *nram_argmax_cur_channel = nram_argmax_cur_loop + channel_idx;
T *nram_grad_out_cur_channel = nram_grad_out_cur_loop + channel_idx;
if (nram_argmax_cur_channel[0] == -1) {
continue;
}
T *grad_in_cur_channel =
grad_in + nram_argmax_cur_channel[0] * channels +
nram_channels_limit * channels_loop_idx + channel_idx;
__bang_atomic_add((T *)nram_grad_in_cur_channel,
(T *)grad_in_cur_channel,
(T *)(nram_grad_out_cur_channel), 1);
}
}
}
}
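// Reference-only scalar sketch (hypothetical helper, not part of the original
// kernel): max-pool backward simply scatters each voxel's grad_out into the
// point that produced the max, which is what the atomic adds above implement.
static inline void maxPool3dBackwardSketch(const int *argmax, const float *grad_out,
                                           float *grad_in, int voxels_num,
                                           int channels) {
  for (int v = 0; v < voxels_num; ++v)
    for (int c = 0; c < channels; ++c) {
      int pt = argmax[v * channels + c];
      if (pt != -1) grad_in[pt * channels + c] += grad_out[v * channels + c];
    }
}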
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiawareAvgPool3dBackward(
const int boxes_num, const int out_x, const int out_y, const int out_z,
const int channels, const int max_pts_each_voxel,
const int *pts_idx_of_voxels, const T *grad_out, T *grad_in) {
// params (int)pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z,
//                                 max_pts_each_voxel)
// params (T)grad_out: (boxes_num, out_x, out_y, out_z, channels)
// params (T)grad_in: (pts_num, channels)
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
int align_num = NFU_ALIGN_SIZE / sizeof(T);
int align_max_pts_each_voxel = PAD_UP(max_pts_each_voxel, align_num);
int nram_channels_limit = PAD_DOWN(
(MAX_NRAM_SIZE - align_max_pts_each_voxel * sizeof(int)) / 2 / sizeof(T),
align_num);
int *nram_pts_idx_cur_voxel = (int *)data_nram;
// nram_pts_idx_cur_voxel [align_max_pts_each_voxel]
T *nram_grad_out_cur_loop =
(T *)((int *)nram_pts_idx_cur_voxel + align_max_pts_each_voxel);
// nram_grad_out_cur_loop [nram_channels_limit]
T *nram_grad_in_cur_loop = (T *)nram_grad_out_cur_loop + nram_channels_limit;
// nram_grad_in_cur_loop [nram_channels_limit]
int channels_loop_times = channels / nram_channels_limit;
int rem_channels = channels % nram_channels_limit;
int voxels_num = boxes_num * out_x * out_y * out_z;
for (int voxel_index = taskId; voxel_index < voxels_num;
voxel_index += taskDim) {
const T *grad_out_cur_voxel = grad_out + voxel_index * channels;
const int *pts_idx_cur_voxel =
pts_idx_of_voxels + voxel_index * max_pts_each_voxel;
__memcpy((void *)nram_pts_idx_cur_voxel, (void *)pts_idx_cur_voxel,
max_pts_each_voxel * sizeof(int), GDRAM2NRAM);
int total_pts_of_voxel = nram_pts_idx_cur_voxel[0];
if (total_pts_of_voxel <= 0) {
continue;
}
float cur_grad = 1.0 / ((float)total_pts_of_voxel);
for (int channels_loop_idx = 0; channels_loop_idx <= channels_loop_times;
channels_loop_idx++) {
int actual_channels_num = (channels_loop_idx == channels_loop_times)
? rem_channels
: nram_channels_limit;
if (actual_channels_num == 0) {
break;
}
const T *grad_out_cur_loop =
grad_out_cur_voxel + nram_channels_limit * channels_loop_idx;
__memcpy((void *)nram_grad_in_cur_loop, (void *)grad_out_cur_loop,
actual_channels_num * sizeof(T), GDRAM2NRAM);
int align_actual_channels_num = PAD_UP(actual_channels_num, align_num);
if (sizeof(T) == sizeof(half)) {
__bang_half2float((float *)nram_grad_out_cur_loop,
(half *)nram_grad_in_cur_loop,
align_actual_channels_num);
__bang_mul_scalar((float *)nram_grad_out_cur_loop,
(float *)nram_grad_out_cur_loop, (float)cur_grad,
align_actual_channels_num);
convertFloat2half((half *)nram_grad_out_cur_loop,
(float *)nram_grad_out_cur_loop,
align_actual_channels_num);
} else {
__bang_mul_scalar((float *)nram_grad_out_cur_loop,
(float *)nram_grad_in_cur_loop, (float)cur_grad,
align_actual_channels_num);
}
for (int k = 1; k <= total_pts_of_voxel; k++) {
T *grad_in_cur_loop = grad_in + nram_pts_idx_cur_voxel[k] * channels +
nram_channels_limit * channels_loop_idx;
__bang_atomic_add((T *)nram_grad_in_cur_loop, (T *)grad_in_cur_loop,
(T *)nram_grad_out_cur_loop, actual_channels_num);
}
}
}
}
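// Reference-only scalar sketch (hypothetical helper, not part of the original
// kernel): average-pool backward gives each contributing point an equal share
// grad_out / n, mirroring the scaled atomic adds above.
static inline void avgPool3dBackwardSketch(const int *pts_idx_of_voxels,
                                           const float *grad_out, float *grad_in,
                                           int voxels_num, int channels,
                                           int max_pts_each_voxel) {
  for (int v = 0; v < voxels_num; ++v) {
    const int *pts_idx = pts_idx_of_voxels + v * max_pts_each_voxel;
    int n = pts_idx[0];
    if (n <= 0) continue;
    for (int k = 1; k <= n; ++k)
      for (int c = 0; c < channels; ++c)
        grad_in[pts_idx[k] * channels + c] += grad_out[v * channels + c] / n;
  }
}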
void KernelRoiawarePool3dBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const int pool_method, const int boxes_num,
const int out_x, const int out_y, const int out_z, const int channels,
const int max_pts_each_voxel, const int *pts_idx_of_voxels,
const int *argmax, const void *grad_out, void *grad_in) {
if (pool_method == 0) {
switch (d_type) {
case CNRT_FLOAT32: {
MLUUnion1KernelRoiawareMaxPool3dBackward<float>
<<<k_dim, k_type, queue>>>(boxes_num, out_x, out_y, out_z, channels,
(int *)argmax, (float *)grad_out,
(float *)grad_in);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelRoiawareMaxPool3dBackward<half>
<<<k_dim, k_type, queue>>>(boxes_num, out_x, out_y, out_z, channels,
(int *)argmax, (half *)grad_out,
(half *)grad_in);
}; break;
default: {
break;
}
}
} else {
switch (d_type) {
case CNRT_FLOAT32: {
MLUUnion1KernelRoiawareAvgPool3dBackward<float>
<<<k_dim, k_type, queue>>>(
boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,
(int *)pts_idx_of_voxels, (float *)grad_out, (float *)grad_in);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelRoiawareAvgPool3dBackward<half>
<<<k_dim, k_type, queue>>>(
boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,
(int *)pts_idx_of_voxels, (half *)grad_out, (half *)grad_in);
}; break;
default: {
break;
}
}
}
}
mmcv/ops/csrc/common/mlu/roipoint_pool3d_large_boxes_num_mlu_kernel.mlu deleted 100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
/*************************************************************************
*
* NRAM partition:
* | boxes3d | ping points + pong points | aux_a ~ aux_f |
* | 7 * sizeof(T) | 6 * deal_num * sizeof(T) | 6 * deal_num * sizeof(T) |
*
*************************************************************************/
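// For illustration only (hypothetical sizes): with a 384 KB NRAM budget and
// float data, span_num_deal = PAD_DOWN(393216 / 12 / 4, NFU_ALIGN_SIZE) = 8192
// points per tile, i.e. twelve equal buffers of deal_num elements: ping/pong
// copies of x, y, z (6 buffers) plus the six auxiliary buffers a-f.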
#define TWELVE_SPLIT 12
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void checkPointsInBox3d(const T *boxes3d,
const size_t deal_num,
T *x,
T *y,
T *z,
T *auxiliary_a,
T *auxiliary_b,
T *auxiliary_c,
T *auxiliary_d,
T *auxiliary_e,
T *auxiliary_f,
T *pts_assign) {
// param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate
T cx = boxes3d[0];
T cy = boxes3d[1];
T cz = boxes3d[2];
T dx = boxes3d[3];
T dy = boxes3d[4];
T dz = boxes3d[5];
T rz = boxes3d[6];
// shift to the center since cz in box3d is the bottom center
cz += 0.5 * dz;
T cosa = (T)std::cos(-rz);
T sina = (T)std::sin(-rz);
// x - cx
__bang_sub_scalar((T *)auxiliary_a, (T *)x, (T)cx, deal_num);
// y - cy
__bang_sub_scalar((T *)auxiliary_b, (T *)y, (T)cy, deal_num);
// z - cz
__bang_sub_scalar((T *)auxiliary_c, (T *)z, (T)cz, deal_num);
// |z - cz|
__bang_active_abs((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
// |z - cz| > dz / 2.0
#if __BANG_ARCH__ >= 322
__bang_gt_scalar((T *)auxiliary_c, (T *)auxiliary_c, (T)(0.5 * dz), deal_num);
#else
__bang_write_value((T *)auxiliary_d, deal_num, (T)(0.5 * dz));
__bang_lt((T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_c, deal_num);
#endif
// !(|z - cz| > dz / 2.0)
__bang_not((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
// (x - cx) * cos(-rz)
__bang_mul_scalar((T *)auxiliary_d, (T *)auxiliary_a, (T)cosa, deal_num);
// (y - cy) * sin(-rz)
__bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_b, (T)sina, deal_num);
// local_x = (x - cx) * cos(-rz) + (y - cy) * -sin(-rz)
__bang_sub((T *)auxiliary_d, (T *)auxiliary_d, (T *)auxiliary_e, deal_num);
// |local_x|
__bang_active_abs((T *)auxiliary_d, (T *)auxiliary_d, deal_num);
// |local_x| < dx / 2.0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar(auxiliary_d, auxiliary_d, (T)(0.5 * dx), deal_num);
#else
__bang_write_value((T *)auxiliary_e, deal_num, (T)(0.5 * dx));
__bang_gt((T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_d, deal_num);
#endif
// (x - cx) * sin(-rz)
__bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_a, (T)sina, deal_num);
// (y - cy) * cos(-rz)
__bang_mul_scalar((T *)auxiliary_f, (T *)auxiliary_b, (T)cosa, deal_num);
// local_y = (x - cx) * sin(-rz) + (y - cy) * cos(-rz)
__bang_add((T *)auxiliary_e, (T *)auxiliary_e, (T *)auxiliary_f, deal_num);
// |local_y|
__bang_active_abs((T *)auxiliary_e, (T *)auxiliary_e, deal_num);
// |local_y| < dy / 2.0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar(auxiliary_e, auxiliary_e, (T)(0.5 * dy), deal_num);
#else
__bang_write_value((T *)auxiliary_f, deal_num, (T)(0.5 * dy));
__bang_gt((T *)auxiliary_e, (T *)auxiliary_f, (T *)auxiliary_e, deal_num);
#endif
// pts_assign = |local_x| < dx / 2.0 && |local_y| < dy / 2.0 && |z - cz| <= dz / 2.0
__bang_mul((T *)pts_assign, (T *)auxiliary_c, (T *)auxiliary_d, deal_num);
__bang_mul((T *)pts_assign, (T *)pts_assign, (T *)auxiliary_e, deal_num);
}
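// Reference-only scalar sketch (hypothetical helper, not part of the original
// kernel; assumes <cmath>, as the kernel already uses std::cos and std::sin):
// the same point-in-rotated-box test for a single point, with box3d given as
// (cx, cy, cz, dx, dy, dz, rz) and cz at the bottom face, as above.
static inline bool pointInBox3dSketch(float px, float py, float pz,
                                      const float *box3d) {
  float cz = box3d[2] + 0.5f * box3d[5];  // shift to the box center
  float cosa = std::cos(-box3d[6]), sina = std::sin(-box3d[6]);
  float local_x = (px - box3d[0]) * cosa - (py - box3d[1]) * sina;
  float local_y = (px - box3d[0]) * sina + (py - box3d[1]) * cosa;
  return std::fabs(pz - cz) <= 0.5f * box3d[5] &&
         std::fabs(local_x) < 0.5f * box3d[3] &&
         std::fabs(local_y) < 0.5f * box3d[4];
}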
template <typename T>
__mlu_func__ void computeStoreRoipointPool3d(char *boxes3d,
int *cnt,
char *points_x,
char *points_y,
char *points_z,
const char *point_features,
char *auxiliary_a,
char *auxiliary_b,
char *auxiliary_c,
char *auxiliary_d,
char *auxiliary_e,
char *auxiliary_f,
const int box_idx,
const int pts_num,
const int feature_in_len,
const int sampled_pts_num,
const size_t span_num_deal,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
char *pts_assign = auxiliary_a;
if (*cnt >= sampled_pts_num) {
return;
}
checkPointsInBox3d((T *)boxes3d, span_num_deal, (T *)points_x, (T *)points_y, (T *)points_z,
(T *)auxiliary_a, (T *)auxiliary_b, (T *)auxiliary_c, (T *)auxiliary_d,
(T *)auxiliary_e, (T *)auxiliary_f, (T *)pts_assign);
// __bang_select returns selected elements vector and the number of selected elements
__bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
uint32_t select_num = *((uint32_t *)auxiliary_b);
if (select_num == 0) {
return;
}
int sampled_pts_num_rem = sampled_pts_num - *cnt;
int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
// copy x to pooled_features_gdram
// The result of __bang_select is composed of three parts:
// The first 4 bytes hold the number of selected elements as an unsigned int.
// The next 124 bytes are zero. The remaining bytes are the selected elements.
int select_num_size = 128;
__memcpy(
pooled_features_gdram + (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T),
(T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
1 * sizeof(T),
(T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
2 * sizeof(T),
(T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy features to pooled_features_gdram
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
(3 + c_idx) * sizeof(T),
auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
}
*cnt += select_num;
}
template <typename T>
__mlu_func__ void computeStoreLastBlockRoipointPool3d(char *boxes3d,
int *cnt,
char *points_x,
char *points_y,
char *points_z,
const char *point_features,
char *auxiliary_a,
char *auxiliary_b,
char *auxiliary_c,
char *auxiliary_d,
char *auxiliary_e,
char *auxiliary_f,
const int box_idx,
const int pts_num,
const int feature_in_len,
const int sampled_pts_num,
const size_t span_num_deal,
const size_t auxiliary_num_deal,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
char *pts_assign = auxiliary_a;
if (*cnt >= sampled_pts_num) {
// pooled_empty_flag_gdram set 0
*((int *)auxiliary_a) = 0;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
return;
}
checkPointsInBox3d((T *)boxes3d, span_num_deal, (T *)points_x, (T *)points_y, (T *)points_z,
(T *)auxiliary_a, (T *)auxiliary_b, (T *)auxiliary_c, (T *)auxiliary_d,
(T *)auxiliary_e, (T *)auxiliary_f, (T *)pts_assign);
// __bang_select returns selected elements vector and the number of selected elements
__bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
uint32_t select_num = *((uint32_t *)auxiliary_b);
if (*cnt + select_num == 0) {
// pooled_empty_flag_gdram set 1
*((int *)auxiliary_a) = 1;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
// pooled_features_gdram set 0
int repeat = (sampled_pts_num * (3 + feature_in_len)) / (auxiliary_num_deal * 6);
int rem = (sampled_pts_num * (3 + feature_in_len)) % (auxiliary_num_deal * 6);
// use auxiliary_a to auxiliary_f
__bang_write_zero((T *)auxiliary_a, PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE));
if (repeat > 0) {
__memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
auxiliary_a, auxiliary_num_deal * 6 * sizeof(T), NRAM2GDRAM,
auxiliary_num_deal * 6 * sizeof(T), 0, repeat - 1);
}
if (rem > 0) {
__memcpy(pooled_features_gdram +
box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T) +
repeat * auxiliary_num_deal * 6 * sizeof(T),
auxiliary_a, rem * sizeof(T), NRAM2GDRAM);
}
return;
}
if (select_num > 0) {
int sampled_pts_num_rem = sampled_pts_num - *cnt;
int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
// copy x to pooled_features_gdram
// The result of __bang_select is composed of three parts:
// The first 4 bytes hold the number of selected elements as an unsigned int.
// The next 124 bytes are zero. The remaining bytes are the selected elements.
int select_num_size = 128;
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T),
(T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
1 * sizeof(T),
(T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
2 * sizeof(T),
(T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy features to pooled_features_gdram
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
(3 + c_idx) * sizeof(T),
auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
}
}
// pooled_empty_flag_gdram set 0
*((int *)auxiliary_a) = 0;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
*cnt += select_num;
if (*cnt < sampled_pts_num) {
// duplicate same points for sampling
int repeat = sampled_pts_num / (*cnt) - 1;
int rem = sampled_pts_num % (*cnt);
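// Worked example (hypothetical values): with *cnt = 3 kept points and
// sampled_pts_num = 8, repeat = 8 / 3 - 1 = 1 and rem = 8 % 3 = 2, so the
// first 3 rows are copied once more in full and then their first 2 rows
// again, giving 3 + 3 + 2 = 8 sampled points.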
if (repeat > 0) {
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T),
pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
(*cnt) * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM,
(*cnt) * (3 + feature_in_len) * sizeof(T), 0, repeat - 1);
}
if (rem > 0) {
__memcpy(
pooled_features_gdram +
(box_idx * sampled_pts_num + (repeat + 1) * (*cnt)) * (3 + feature_in_len) *
sizeof(T),
pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
rem * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM);
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
if (coreId == 0x80) {
return;
}
size_t boxes_per_core = (batch_size * boxes_num) / taskDim;
size_t boxes_rem = (batch_size * boxes_num) % taskDim;
// calc batch_start, batch_end, first_batch_box_start, last_batch_box_end for each core
int32_t batch_start = taskId < (boxes_rem + 1) ?
(taskId * (boxes_per_core + 1)) / boxes_num :
(taskId * boxes_per_core + boxes_rem) / boxes_num;
int32_t batch_end = taskId < boxes_rem ?
((taskId + 1) * (boxes_per_core + 1) - 1) / boxes_num :
((taskId + 1) * boxes_per_core + boxes_rem - 1) / boxes_num;
size_t first_batch_box_start = taskId < (boxes_rem + 1) ?
(taskId * (boxes_per_core + 1)) - batch_start * boxes_num :
taskId * boxes_per_core + boxes_rem - batch_start * boxes_num;
size_t last_batch_box_end = taskId < boxes_rem ?
(taskId + 1) * (boxes_per_core + 1) - batch_end * boxes_num :
((taskId + 1) * boxes_per_core + boxes_rem) - batch_end * boxes_num;
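// Worked example (hypothetical sizes): batch_size = 2, boxes_num = 5 and
// taskDim = 4 give boxes_per_core = 2 and boxes_rem = 2, so the cores handle
// 3, 3, 2 and 2 boxes; e.g. taskId = 1 gets batch 0 boxes [3, 5) and batch 1
// box [0, 1).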
// points_xyz : [3, B, N]
const char *points_x_gdram = points_xyz_gdram;
const char *points_y_gdram = points_xyz_gdram + (1 * batch_size * pts_num) * sizeof(T);
const char *points_z_gdram = points_xyz_gdram + (2 * batch_size * pts_num) * sizeof(T);
size_t boxes3d_size = PAD_UP(7, NFU_ALIGN_SIZE) * sizeof(T);
size_t span_num_deal = PAD_DOWN(MAX_NRAM_SIZE / TWELVE_SPLIT / sizeof(T), NFU_ALIGN_SIZE);
size_t align_num = NFU_ALIGN_SIZE;
int32_t repeat = pts_num / span_num_deal;
size_t rem = pts_num % span_num_deal;
size_t align_rem = CEIL_ALIGN(rem, align_num);
char *boxes3d = nram_buffer;
char *ping_points_x = nram_buffer + boxes3d_size;
char *ping_points_y = ping_points_x + span_num_deal * sizeof(T);
char *ping_points_z = ping_points_y + span_num_deal * sizeof(T);
size_t ping_pong_gap = 3 * span_num_deal * sizeof(T);
char *auxiliary_a = ping_points_x + 2 * ping_pong_gap;
char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T);
char *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T);
char *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T);
char *auxiliary_e = auxiliary_d + span_num_deal * sizeof(T);
char *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T);
size_t span_load_input1_size = span_num_deal * sizeof(T);
size_t span_load_input2_size = span_num_deal * sizeof(T);
size_t span_load_input3_size = span_num_deal * sizeof(T);
size_t span_load_input4_size = span_num_deal * sizeof(T);
int cnt = 0;
for (int bs_idx = batch_start; bs_idx <= batch_end; bs_idx++) {
const char *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T);
const char *points_y_start = points_y_gdram + bs_idx * pts_num * sizeof(T);
const char *points_z_start = points_z_gdram + bs_idx * pts_num * sizeof(T);
const char *point_features_start =
point_features_gdram + bs_idx * feature_in_len * pts_num * sizeof(T);
char *pooled_features_start =
pooled_features_gdram +
(bs_idx * boxes_num * sampled_pts_num * (3 + feature_in_len)) * sizeof(T);
char *pooled_empty_flag_start = pooled_empty_flag_gdram + bs_idx * boxes_num * sizeof(int);
size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0;
size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num;
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
__memcpy_async(boxes3d,
boxes3d_gdram + bs_idx * boxes_num * 7 * sizeof(T) + box_idx * 7 * sizeof(T),
7 * sizeof(T), GDRAM2NRAM);
cnt = 0;
if (repeat > 0) {
__memcpy_async(ping_points_x, points_x_start, span_load_input1_size, GDRAM2NRAM);
__memcpy_async(ping_points_y, points_y_start, span_load_input2_size, GDRAM2NRAM);
__memcpy_async(ping_points_z, points_z_start, span_load_input3_size, GDRAM2NRAM);
__asm__ volatile("sync;");
}
for (int i = 0; i < repeat - 1; i++) {
__memcpy_async(ping_points_x + ((i + 1) % 2) * ping_pong_gap,
points_x_start + (i + 1) * span_load_input1_size, span_load_input1_size,
GDRAM2NRAM);
__memcpy_async(ping_points_y + ((i + 1) % 2) * ping_pong_gap,
points_y_start + (i + 1) * span_load_input2_size, span_load_input2_size,
GDRAM2NRAM);
__memcpy_async(ping_points_z + ((i + 1) % 2) * ping_pong_gap,
points_z_start + (i + 1) * span_load_input3_size, span_load_input3_size,
GDRAM2NRAM);
computeStoreRoipointPool3d<T>(
boxes3d, &cnt, ping_points_x + (i % 2) * ping_pong_gap,
ping_points_y + (i % 2) * ping_pong_gap, ping_points_z + (i % 2) * ping_pong_gap,
point_features_start + i * span_load_input4_size, auxiliary_a, auxiliary_b, auxiliary_c,
auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
__asm__ volatile("sync;");
}
if (rem > 0) {
if (sizeof(T) == sizeof(float)) {
__bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
} else {
__bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
}
__memcpy_async(ping_points_x + (repeat % 2) * ping_pong_gap,
points_x_start + repeat * span_load_input1_size, rem * sizeof(T),
GDRAM2NRAM);
__memcpy_async(ping_points_y + (repeat % 2) * ping_pong_gap,
points_y_start + repeat * span_load_input2_size, rem * sizeof(T),
GDRAM2NRAM);
__memcpy_async(ping_points_z + (repeat % 2) * ping_pong_gap,
points_z_start + repeat * span_load_input3_size, rem * sizeof(T),
GDRAM2NRAM);
}
if (repeat > 0 && rem > 0) {
computeStoreRoipointPool3d<T>(
boxes3d, &cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
} else if (repeat > 0 && rem == 0) {
computeStoreLastBlockRoipointPool3d<T>(
boxes3d, &cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, span_num_deal, pooled_features_start,
pooled_empty_flag_start);
}
if (rem > 0) {
__asm__ volatile("sync;");
computeStoreLastBlockRoipointPool3d<T>(
boxes3d, &cnt, ping_points_x + (repeat % 2) * ping_pong_gap,
ping_points_y + (repeat % 2) * ping_pong_gap,
ping_points_z + (repeat % 2) * ping_pong_gap,
point_features_start + repeat * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, align_rem, span_num_deal, pooled_features_start,
pooled_empty_flag_start);
}
}
}
}
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<float>(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram);
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<half>(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram);
void KernelRoiPointPool3dLargeBoxesNumForward(cnrtDim3_t k_dim,
cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const void *points_xyz,
const void *boxes3d,
const void *point_features,
void *pooled_features,
int *pooled_empty_flag) {
switch (d_type) {
default: { break; }
case CNRT_FLOAT32: {
MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<float><<<k_dim, k_type, queue>>>(
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
(char *)points_xyz, (char *)point_features, (char *)boxes3d,
(char *)pooled_features, (char *)pooled_empty_flag);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<half><<<k_dim, k_type, queue>>>(
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
(char *)points_xyz, (char *)point_features, (char *)boxes3d,
(char *)pooled_features, (char *)pooled_empty_flag);
}; break;
}
}
mmcv/ops/csrc/common/mlu/roipoint_pool3d_mlu_kernel.mlu deleted 100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
/**************************************************************************************
*
* NRAM partition:
* | boxes3d | cnt |
* | boxes_num * 7 * sizeof(T) | boxes_num * sizeof(int) |
*
* | ping points | pong points | aux_a ~ aux_f |
* | 3 * deal_num * sizeof(T) | 3 * deal_num * sizeof(T) | 6 * deal_num * sizeof(T) |
*
***************************************************************************************/
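// The TWELVE_SPLIT factor below matches this layout: 3 ping buffers + 3 pong
// buffers + 6 auxiliary buffers = 12 equally sized spans of deal_num elements,
// so deal_num is derived from the NRAM left over after boxes3d and cnt.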
#define TWELVE_SPLIT 12
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void checkPointsInBox3d(const T *boxes3d,
const size_t deal_num,
T *x,
T *y,
T *z,
T *auxiliary_a,
T *auxiliary_b,
T *auxiliary_c,
T *auxiliary_d,
T *auxiliary_e,
T *auxiliary_f,
T *pts_assign) {
// param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate
T cx = boxes3d[0];
T cy = boxes3d[1];
T cz = boxes3d[2];
T dx = boxes3d[3];
T dy = boxes3d[4];
T dz = boxes3d[5];
T rz = boxes3d[6];
// shift to the center since cz in box3d is the bottom center
cz += 0.5 * dz;
T cosa = (T)std::cos(-rz);
T sina = (T)std::sin(-rz);
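// The point is mapped into the box's local frame:
//   local_x = (x - cx) * cos(-rz) - (y - cy) * sin(-rz)
//   local_y = (x - cx) * sin(-rz) + (y - cy) * cos(-rz)
// and counts as inside when |local_x| < dx / 2, |local_y| < dy / 2 and
// |z - cz| <= dz / 2; the element-wise steps below implement exactly this test.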
// x - cx
__bang_sub_scalar((T *)auxiliary_a, (T *)x, (T)cx, deal_num);
// y - cy
__bang_sub_scalar((T *)auxiliary_b, (T *)y, (T)cy, deal_num);
// z - cz
__bang_sub_scalar((T *)auxiliary_c, (T *)z, (T)cz, deal_num);
// |z - cz|
__bang_active_abs((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
// |z - cz| > dz / 2.0
#if __BANG_ARCH__ >= 322
__bang_gt_scalar((T *)auxiliary_c, (T *)auxiliary_c, (T)(0.5 * dz), deal_num);
#else
__bang_write_value((T *)auxiliary_d, deal_num, (T)(0.5 * dz));
__bang_lt((T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_c, deal_num);
#endif
// !(|z - cz| > dz / 2.0)
__bang_not((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
// (x - cx) * cos(-rz)
__bang_mul_scalar((T *)auxiliary_d, (T *)auxiliary_a, (T)cosa, deal_num);
// (y - cy) * sin(-rz)
__bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_b, (T)sina, deal_num);
// local_x = (x - cx) * cos(-rz) + (y - cy) * -sin(-rz)
__bang_sub((T *)auxiliary_d, (T *)auxiliary_d, (T *)auxiliary_e, deal_num);
// |local_x|
__bang_active_abs((T *)auxiliary_d, (T *)auxiliary_d, deal_num);
// |local_x| < dx / 2.0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar(auxiliary_d, auxiliary_d, (T)(0.5 * dx), deal_num);
#else
__bang_write_value((T *)auxiliary_e, deal_num, (T)(0.5 * dx));
__bang_gt((T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_d, deal_num);
#endif
// (x - cx) * sin(-rz)
__bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_a, (T)sina, deal_num);
// (y - cy) * cos(-rz)
__bang_mul_scalar((T *)auxiliary_f, (T *)auxiliary_b, (T)cosa, deal_num);
// local_y = (x - cx) * sin(-rz) + (y - cy) * cos(-rz)
__bang_add((T *)auxiliary_e, (T *)auxiliary_e, (T *)auxiliary_f, deal_num);
// |local_y|
__bang_active_abs((T *)auxiliary_e, (T *)auxiliary_e, deal_num);
// |local_y| < dy / 2.0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar(auxiliary_e, auxiliary_e, (T)(0.5 * dy), deal_num);
#else
__bang_write_value((T *)auxiliary_f, deal_num, (T)(0.5 * dy));
__bang_gt((T *)auxiliary_e, (T *)auxiliary_f, (T *)auxiliary_e, deal_num);
#endif
// pts_assign = (|local_x| < dx / 2.0) && (|local_y| < dy / 2.0) && !(|z - cz| > dz / 2.0)
__bang_mul((T *)pts_assign, (T *)auxiliary_c, (T *)auxiliary_d, deal_num);
__bang_mul((T *)pts_assign, (T *)pts_assign, (T *)auxiliary_e, deal_num);
}
template <typename T>
__mlu_func__ void computeStoreRoipointPool3d(char *boxes3d,
int *cnt,
char *points_x,
char *points_y,
char *points_z,
const char *point_features,
char *auxiliary_a,
char *auxiliary_b,
char *auxiliary_c,
char *auxiliary_d,
char *auxiliary_e,
char *auxiliary_f,
const int box_idx,
const int pts_num,
const int feature_in_len,
const int sampled_pts_num,
const size_t span_num_deal,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
char *pts_assign = auxiliary_a;
if (cnt[box_idx] >= sampled_pts_num) {
return;
}
checkPointsInBox3d((T *)(boxes3d + box_idx * 7 * sizeof(T)), span_num_deal, (T *)points_x,
(T *)points_y, (T *)points_z, (T *)auxiliary_a, (T *)auxiliary_b,
(T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_f,
(T *)pts_assign);
// __bang_select returns the vector of selected elements and the number of selected elements
__bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
uint32_t select_num = *((uint32_t *)auxiliary_b);
if (select_num == 0) {
return;
}
int sampled_pts_num_rem = sampled_pts_num - cnt[box_idx];
int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
// copy x to pooled_features_gdram
// The result of __bang_select is composed of three parts:
// the first 4 bytes hold the number of selected elements (an unsigned int),
// the next 124 bytes are zero padding, and the remaining bytes are the
// selected elements themselves (hence the 128-byte offset below).
int select_num_size = 128;
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T),
(T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
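// The strided copy above scatters (segnum + 1) x-values into the pooled output:
// one value per point record, each record being (3 + feature_in_len) elements
// wide, starting at slot cnt[box_idx] of this box.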
// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
1 * sizeof(T),
(T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
2 * sizeof(T),
(T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy features to pooled_features_gdram
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
(3 + c_idx) * sizeof(T),
auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
}
cnt[box_idx] += select_num;
}
template <typename T>
__mlu_func__ void computeStoreLastBlockRoipointPool3d(char *boxes3d,
int *cnt,
char *points_x,
char *points_y,
char *points_z,
const char *point_features,
char *auxiliary_a,
char *auxiliary_b,
char *auxiliary_c,
char *auxiliary_d,
char *auxiliary_e,
char *auxiliary_f,
const int box_idx,
const int pts_num,
const int feature_in_len,
const int sampled_pts_num,
const size_t span_num_deal,
const size_t auxiliary_num_deal,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
char *pts_assign = auxiliary_a;
if (cnt[box_idx] >= sampled_pts_num) {
// pooled_empty_flag_gdram set 0
*((int *)auxiliary_a) = 0;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
return;
}
checkPointsInBox3d((T *)(boxes3d + box_idx * 7 * sizeof(T)), span_num_deal, (T *)points_x,
(T *)points_y, (T *)points_z, (T *)auxiliary_a, (T *)auxiliary_b,
(T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_f,
(T *)pts_assign);
// __bang_select returns the vector of selected elements and the number of selected elements
__bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
uint32_t select_num = *((uint32_t *)auxiliary_b);
if (cnt[box_idx] + select_num == 0) {
// pooled_empty_flag_gdram set 1
*((int *)auxiliary_a) = 1;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
// pooled_features_gdram set 0
int repeat = (sampled_pts_num * (3 + feature_in_len)) / (auxiliary_num_deal * 6);
int rem = (sampled_pts_num * (3 + feature_in_len)) % (auxiliary_num_deal * 6);
// use auxiliary_a to auxiliary_f
__bang_write_zero((T *)auxiliary_a, PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE));
if (repeat > 0) {
__memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
auxiliary_a, auxiliary_num_deal * 6 * sizeof(T), NRAM2GDRAM,
auxiliary_num_deal * 6 * sizeof(T), 0, repeat - 1);
}
if (rem > 0) {
__memcpy(pooled_features_gdram +
box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T) +
repeat * auxiliary_num_deal * 6 * sizeof(T),
auxiliary_a, rem * sizeof(T), NRAM2GDRAM);
}
return;
}
if (select_num > 0) {
int sampled_pts_num_rem = sampled_pts_num - cnt[box_idx];
int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
// copy x to pooled_features_gdram
// The result of __bang_select is composed of three parts:
// the first 4 bytes hold the number of selected elements (an unsigned int),
// the next 124 bytes are zero padding, and the remaining bytes are the
// selected elements themselves (hence the 128-byte offset below).
int select_num_size = 128;
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T),
(T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
1 * sizeof(T),
(T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
2 * sizeof(T),
(T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy features to pooled_features_gdram
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
(3 + c_idx) * sizeof(T),
auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
}
}
// pooled_empty_flag_gdram set 0
*((int *)auxiliary_a) = 0;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
cnt[box_idx] += select_num;
if (cnt[box_idx] < sampled_pts_num) {
// duplicate same points for sampling
int repeat = sampled_pts_num / cnt[box_idx] - 1;
int rem = sampled_pts_num % cnt[box_idx];
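// Example: with cnt[box_idx] = 100 collected points and sampled_pts_num = 512,
// repeat = 4 and rem = 12, so the 100 records are duplicated 4 more times and
// the first 12 records once more, filling exactly 512 output slots.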
if (repeat > 0) {
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T),
pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
cnt[box_idx] * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM,
cnt[box_idx] * (3 + feature_in_len) * sizeof(T), 0, repeat - 1);
}
if (rem > 0) {
__memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + (repeat + 1) * cnt[box_idx]) *
(3 + feature_in_len) * sizeof(T),
pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
rem * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM);
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiPointPool3dForward(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
if (coreId == 0x80) {
return;
}
size_t boxes_per_core = (batch_size * boxes_num) / taskDim;
size_t boxes_rem = (batch_size * boxes_num) % taskDim;
// calc batch_start, batch_end, first_batch_box_start and last_batch_box_end for each core
int32_t batch_start = taskId < (boxes_rem + 1) ?
(taskId * (boxes_per_core + 1)) / boxes_num :
(taskId * boxes_per_core + boxes_rem) / boxes_num;
int32_t batch_end = taskId < boxes_rem ?
((taskId + 1) * (boxes_per_core + 1) - 1) / boxes_num :
((taskId + 1) * boxes_per_core + boxes_rem - 1) / boxes_num;
size_t first_batch_box_start = taskId < (boxes_rem + 1) ?
(taskId * (boxes_per_core + 1)) - batch_start * boxes_num :
taskId * boxes_per_core + boxes_rem - batch_start * boxes_num;
size_t last_batch_box_end = taskId < boxes_rem ?
(taskId + 1) * (boxes_per_core + 1) - batch_end * boxes_num :
((taskId + 1) * boxes_per_core + boxes_rem) - batch_end * boxes_num;
// points_xyz : [3, B, N]
const char *points_x_gdram = points_xyz_gdram;
const char *points_y_gdram = points_xyz_gdram + (1 * batch_size * pts_num) * sizeof(T);
const char *points_z_gdram = points_xyz_gdram + (2 * batch_size * pts_num) * sizeof(T);
size_t boxes3d_size = PAD_UP(boxes_num * 7, NFU_ALIGN_SIZE) * sizeof(T);
size_t cnt_size = PAD_UP(boxes_num, NFU_ALIGN_SIZE) * sizeof(int);
size_t span_num_deal = PAD_DOWN(
(MAX_NRAM_SIZE - boxes3d_size - cnt_size) / TWELVE_SPLIT / sizeof(T), NFU_ALIGN_SIZE);
size_t align_num = NFU_ALIGN_SIZE;
int32_t repeat = pts_num / span_num_deal;
size_t rem = pts_num % span_num_deal;
size_t align_rem = CEIL_ALIGN(rem, align_num);
char *boxes3d = nram_buffer;
char *cnt = nram_buffer + boxes3d_size;
char *ping_points_x = cnt + cnt_size;
char *ping_points_y = ping_points_x + span_num_deal * sizeof(T);
char *ping_points_z = ping_points_y + span_num_deal * sizeof(T);
size_t ping_pong_gap = 3 * span_num_deal * sizeof(T);
char *auxiliary_a = ping_points_x + 2 * ping_pong_gap;
char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T);
char *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T);
char *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T);
char *auxiliary_e = auxiliary_d + span_num_deal * sizeof(T);
char *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T);
size_t span_load_input1_size = span_num_deal * sizeof(T);
size_t span_load_input2_size = span_num_deal * sizeof(T);
size_t span_load_input3_size = span_num_deal * sizeof(T);
size_t span_load_input4_size = span_num_deal * sizeof(T);
for (int bs_idx = batch_start; bs_idx <= batch_end; bs_idx++) {
__memcpy_async(boxes3d, boxes3d_gdram + bs_idx * boxes_num * 7 * sizeof(T),
boxes_num * 7 * sizeof(T), GDRAM2NRAM);
__bang_write_zero((int *)cnt, PAD_UP(boxes_num, NFU_ALIGN_SIZE));
const char *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T);
const char *points_y_start = points_y_gdram + bs_idx * pts_num * sizeof(T);
const char *points_z_start = points_z_gdram + bs_idx * pts_num * sizeof(T);
const char *point_features_start =
point_features_gdram + bs_idx * feature_in_len * pts_num * sizeof(T);
char *pooled_features_start =
pooled_features_gdram +
(bs_idx * boxes_num * sampled_pts_num * (3 + feature_in_len)) * sizeof(T);
char *pooled_empty_flag_start = pooled_empty_flag_gdram + bs_idx * boxes_num * sizeof(int);
size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0;
size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num;
if (repeat > 0) {
__memcpy_async(ping_points_x, points_x_start, span_load_input1_size, GDRAM2NRAM);
__memcpy_async(ping_points_y, points_y_start, span_load_input2_size, GDRAM2NRAM);
__memcpy_async(ping_points_z, points_z_start, span_load_input3_size, GDRAM2NRAM);
__asm__ volatile("sync;");
}
for (int i = 0; i < repeat - 1; i++) {
__memcpy_async(ping_points_x + ((i + 1) % 2) * ping_pong_gap,
points_x_start + (i + 1) * span_load_input1_size, span_load_input1_size,
GDRAM2NRAM);
__memcpy_async(ping_points_y + ((i + 1) % 2) * ping_pong_gap,
points_y_start + (i + 1) * span_load_input2_size, span_load_input2_size,
GDRAM2NRAM);
__memcpy_async(ping_points_z + ((i + 1) % 2) * ping_pong_gap,
points_z_start + (i + 1) * span_load_input3_size, span_load_input3_size,
GDRAM2NRAM);
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
computeStoreRoipointPool3d<T>(
boxes3d, (int *)cnt, ping_points_x + (i % 2) * ping_pong_gap,
ping_points_y + (i % 2) * ping_pong_gap, ping_points_z + (i % 2) * ping_pong_gap,
point_features_start + i * span_load_input4_size, auxiliary_a, auxiliary_b, auxiliary_c,
auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
}
__asm__ volatile("sync;");
}
if (rem > 0) {
if (sizeof(T) == sizeof(float)) {
__bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
} else {
__bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
}
__memcpy_async(ping_points_x + (repeat % 2) * ping_pong_gap,
points_x_start + repeat * span_load_input1_size, rem * sizeof(T), GDRAM2NRAM);
__memcpy_async(ping_points_y + (repeat % 2) * ping_pong_gap,
points_y_start + repeat * span_load_input2_size, rem * sizeof(T), GDRAM2NRAM);
__memcpy_async(ping_points_z + (repeat % 2) * ping_pong_gap,
points_z_start + repeat * span_load_input3_size, rem * sizeof(T), GDRAM2NRAM);
}
if (repeat > 0 && rem > 0) {
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
computeStoreRoipointPool3d<T>(
boxes3d, (int *)cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
}
} else if (repeat > 0 && rem == 0) {
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
computeStoreLastBlockRoipointPool3d<T>(
boxes3d, (int *)cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, span_num_deal, pooled_features_start,
pooled_empty_flag_start);
}
}
if (rem > 0) {
__asm__ volatile("sync;");
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
computeStoreLastBlockRoipointPool3d<T>(
boxes3d, (int *)cnt, ping_points_x + (repeat % 2) * ping_pong_gap,
ping_points_y + (repeat % 2) * ping_pong_gap,
ping_points_z + (repeat % 2) * ping_pong_gap,
point_features_start + repeat * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, align_rem, span_num_deal, pooled_features_start,
pooled_empty_flag_start);
}
}
}
}
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dForward<float>(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram);
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dForward<half>(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram);
void KernelRoiPointPool3dForward(cnrtDim3_t k_dim,
cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const void *points_xyz,
const void *boxes3d,
const void *point_features,
void *pooled_features,
int *pooled_empty_flag) {
switch (d_type) {
default: { break; }
case CNRT_FLOAT32: {
MLUUnion1KernelRoiPointPool3dForward<float><<<k_dim, k_type, queue>>>(
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
(char *)points_xyz, (char *)point_features, (char *)boxes3d,
(char *)pooled_features, (char *)pooled_empty_flag);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelRoiPointPool3dForward<half><<<k_dim, k_type, queue>>>(
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
(char *)points_xyz, (char *)point_features, (char *)boxes3d,
(char *)pooled_features, (char *)pooled_empty_flag);
}; break;
}
}
mmcv/ops/csrc/common/mlu/three_nn_mlu_kernel.mlu deleted 100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include <algorithm>
__nram__ char nram_buffer[MAX_NRAM_SIZE];
#if __BANG_ARCH__ >= 322
/**
* returns the index of ret, which is stored at the 1st position of the `ret`,
* used after bang_min
*/
__mlu_func__ uint32_t getIndice(half *ret) {
uint32_t indice = *((uint32_t *)((uint16_t *)ret + 1));
return indice;
}
/**
* returns the index of ret, which is stored at the 1st position of the `ret`,
* used after bang_min
*/
__mlu_func__ uint32_t getIndice(float *ret) {
uint32_t indice = ((uint32_t *)ret)[1];
return indice;
}
#endif
template <typename T>
__mlu_func__ void auxArgmin(T *nram_dst, T *nram_src, const int num_deal,
T *value, int *index) {
__bang_min(nram_dst, nram_src, num_deal);
*value = nram_dst[0];
__bang_write_value(nram_dst, num_deal, *value);
__bang_eq(nram_dst, nram_src, nram_dst, num_deal);
__bang_findfirst1((uint32_t *)nram_dst, nram_dst, num_deal);
*index = *((int *)nram_dst);
}
template <typename T>
__mlu_func__ void auxFuncFind3Min(T *nram_aux_a, const int auxa_offset,
int *nram_aux_b, const int auxb_offset,
T *nram_dest, T *nram_aux_sort_a,
int *nram_aux_sort_b, const int deal_offset) {
__bang_write_value(nram_aux_sort_a, auxa_offset, (T)(INFINITY));
__bang_write_value(nram_aux_sort_b, auxb_offset, (int)0);
int index = 0;
for (int i = 0; i < 3; i++) {
#if __BANG_ARCH__ >= 322
__bang_argmin(nram_dest, nram_aux_a, auxa_offset);
nram_aux_sort_a[i] = nram_dest[0];
index = getIndice(nram_dest);
#else
T value = 0;
auxArgmin(nram_dest, nram_aux_a, auxa_offset, &value, &index);
nram_aux_sort_a[i] = value;
#endif
nram_aux_sort_b[i] = nram_aux_b[index];
__memset_nram(nram_aux_a + index, 1, (T)(INFINITY));
}
__memcpy((char *)nram_aux_a, (char *)nram_aux_sort_a, auxa_offset * sizeof(T),
NRAM2NRAM);
__memcpy((char *)nram_aux_b, (char *)nram_aux_sort_b,
auxb_offset * sizeof(int), NRAM2NRAM);
}
template <typename T>
__mlu_func__ void auxFuncSort(T *nram_aux_a, const int auxa_offset,
int *nram_aux_b, const int auxb_offset,
T *nram_dest, T *nram_help_value,
int *nram_help_idx, const int num_deal,
const int deal_offset) {
for (int k = 0; k < num_deal; ++k) {
auxFuncFind3Min(nram_aux_a + k * auxa_offset, auxa_offset,
nram_aux_b + k * auxb_offset, auxb_offset, nram_dest,
nram_help_value, nram_help_idx, deal_offset);
}
}
template <typename T>
__mlu_func__ void auxFuncNN(
size_t *output_aux_sort_a_gap, size_t *output_aux_sort_b_gap,
size_t *output_aux_dest_gap, size_t *output_unknown_gap,
size_t *output_known_gap, size_t *output_dist_gap, size_t *auxillary_a_gap,
size_t *auxillary_b_gap, size_t *known_num_deal, size_t *unknown_num_deal,
size_t *align_num, size_t *auxa_offset, size_t *auxb_offset) {
/*
* nram partition:
* |-NFU_ALIGN_SIZE-|-2*NFU_ALIGN_SIZE-|-X*3*sizeof(T)-|
* space: | aux_sort_a | aux_sort_b | nram_unknown |
*
* | ------ (Y * 7 *sizeof(T)) ---------------- |
* | nram_known | nram_dist | nram_dest |
*
* | -X * NFU_ALIGN_SIZE ---|---X * 2 * NFU_ALIGN_SIZE-|
* | output_dist(aux_a) | output_dist(aux_b) |
* 200 series
* X = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (2/3) / (3 * sizeof(T) + 3 *
* NFU_ALIGN_SIZE)
* Y = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (1/3) / (7 * sizeof(T))
* 300 series
* X = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (4/5) / (3 *
* sizeof(T) + 3 * NFU_ALIGN_SIZE)
* Y = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) *
* (1/5) / (7 * sizeof(T))
*
*/
*align_num = NFU_ALIGN_SIZE / sizeof(T);
*auxa_offset = NFU_ALIGN_SIZE / sizeof(T);
*auxb_offset = 2 * NFU_ALIGN_SIZE / sizeof(int);
#if __BANG_ARCH__ >= 322
*known_num_deal = PAD_DOWN(
(MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 5 / (7 * sizeof(T)), *align_num);
*unknown_num_deal = PAD_DOWN((MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 5 * 4 /
(3 * sizeof(T) + 3 * NFU_ALIGN_SIZE),
*align_num);
#else
*known_num_deal = PAD_DOWN(
(MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 3 / (7 * sizeof(T)), *align_num);
*unknown_num_deal = PAD_DOWN((MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 3 * 2 /
(3 * sizeof(T) + 3 * NFU_ALIGN_SIZE),
*align_num);
#endif
*output_aux_sort_a_gap = 0;
*output_aux_sort_b_gap = *output_aux_sort_a_gap + NFU_ALIGN_SIZE;
*output_aux_dest_gap = *output_aux_sort_b_gap + 2 * NFU_ALIGN_SIZE;
*output_unknown_gap = *output_aux_dest_gap + *known_num_deal * sizeof(T);
*output_known_gap = *output_unknown_gap + *unknown_num_deal * 3 * sizeof(T);
*output_dist_gap = *output_known_gap + *known_num_deal * 3 * sizeof(T);
*auxillary_a_gap = *output_dist_gap + *known_num_deal * 3 * sizeof(T);
*auxillary_b_gap = *auxillary_a_gap + *unknown_num_deal * NFU_ALIGN_SIZE;
}
#if __BANG_ARCH__ >= 322
template <typename T>
__mlu_func__ bool containNanInf(T *nram_unknown) {
if (std::isnan(nram_unknown[0]) || std::isnan(nram_unknown[1]) ||
std::isnan(nram_unknown[2]) || std::isinf(nram_unknown[0]) ||
std::isinf(nram_unknown[1]) || std::isinf(nram_unknown[2]))
return true;
else
return false;
}
#endif
template <typename T>
__mlu_func__ void computeThreeNN(T *nram_unknown, T *nram_known, T *nram_dist,
T *nram_dest, T *nram_aux_a,
T *nram_aux_sort_a, int *nram_aux_b,
int *nram_aux_sort_b, const int known_num_deal,
const int known_seg_num, const int deal_offset,
const int known_count,
const int known_count_align) {
__bang_write_value(nram_dist, 3 * known_num_deal, (T)(INFINITY));
#if __BANG_ARCH__ >= 322
if (!containNanInf(nram_unknown)) {
#endif
// x1 - x2
__bang_sub_scalar(nram_dist, nram_known, nram_unknown[0],
known_count_align);
// y1 - y2
__bang_sub_scalar(nram_dist + known_count_align,
nram_known + known_count_align, nram_unknown[1],
known_count_align);
// z1 - z2
__bang_sub_scalar(nram_dist + 2 * known_count_align,
nram_known + 2 * known_count_align, nram_unknown[2],
known_count_align);
__bang_square(nram_dist, nram_dist, 3 * known_count_align);
__bang_add(nram_dist, nram_dist, nram_dist + known_count_align,
known_count_align);
__bang_add(nram_dist, nram_dist, nram_dist + 2 * known_count_align,
known_count_align);
#if __BANG_ARCH__ >= 322
}
#endif
int index = 0;
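// Take the argmin three times; after each hit the selected slot is overwritten
// with INFINITY so the next iteration returns the next-nearest neighbor. The
// global index is recovered by adding the segment offset
// known_seg_num * known_num_deal.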
for (int i = 0; i < 3; i++) {
#if __BANG_ARCH__ >= 322
__bang_argmin(nram_dest, nram_dist, known_count_align);
nram_aux_a[i + deal_offset] = nram_dest[0];
index = getIndice(nram_dest);
#else
T value = 0;
auxArgmin(nram_dest, nram_dist, known_count_align, &value, &index);
nram_aux_a[i + deal_offset] = value;
#endif
nram_aux_b[i + deal_offset] = index + known_seg_num * known_num_deal;
__memset_nram(nram_dist + index, 1, (T)(INFINITY));
}
}
template <typename T>
__mlu_func__ void loadTransposedKnownTensor(
char *nram_known, char *nram_dist, const char *known_gdram,
const int known_num_deal, const int batch_id, const int m,
const int known_seg_num, const int count, const int count_align_num) {
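// known_gdram is assumed to hold the transposed known points, laid out as
// [b, 3, m]; the strided copy below gathers `count` x, y and z values
// (3 rows of stride m) for this segment into consecutive rows of
// count_align_num elements in NRAM.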
__bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY));
#if __BANG_ARCH__ >= 322
__bang_write_value(nram_dist, 3 * known_num_deal, (T)(INFINITY));
__memcpy(nram_dist,
known_gdram +
(batch_id * m * 3 + known_seg_num * known_num_deal) * sizeof(T),
count * sizeof(T), GDRAM2NRAM, count_align_num * sizeof(T),
m * sizeof(T), 2);
__bang_minequal((T *)nram_known, (T *)nram_known, (T *)nram_dist,
3 * count_align_num);
#else
__memcpy(nram_known,
known_gdram +
(batch_id * m * 3 + known_seg_num * known_num_deal) * sizeof(T),
count * sizeof(T), GDRAM2NRAM, count_align_num * sizeof(T),
m * sizeof(T), 2);
#endif
}
template <typename T>
__mlu_func__ void loadUnknownTensor(char *nram_unknown,
const char *unknown_gdram,
const int unknown_num_deal,
const int unknown_seg_num, const int count,
const int count_align_num) {
__memcpy(nram_unknown,
unknown_gdram + unknown_seg_num * unknown_num_deal * 3 * sizeof(T),
count * 3 * sizeof(T), GDRAM2NRAM);
}
template <typename T>
__mlu_func__ void auxProcessSegment(
const int m, const int n, T *nram_unknown, T *nram_known, T *nram_dist,
T *nram_dest, T *known_gdram, T *nram_aux_a, const int auxa_offset,
int *nram_aux_b, const int auxb_offset, T *nram_aux_sort_a,
int *nram_aux_sort_b, const int unknown_num_deal, const int known_num_deal,
const int known_seg_num, const int unknown_seg_num, const int unknown_count,
const int known_count, const int known_count_align, const int start_idx,
int *deal_offset) {
int pre_batch_id = -1;
int cur_batch_id = -1;
pre_batch_id = start_idx / n;
// if the aux_a space is about to run out, reduce it to its 3 smallest entries and clear the rest.
if (*deal_offset >= PAD_DOWN(auxa_offset, 3)) {
auxFuncSort(nram_aux_a, auxa_offset, nram_aux_b, auxb_offset, nram_dest,
nram_aux_sort_a, nram_aux_sort_b, unknown_count, *deal_offset);
*deal_offset = 3;
}
// load i'th segment of known batch data.
loadTransposedKnownTensor<T>((char *)nram_known, (char *)nram_dist,
(char *)known_gdram, known_num_deal,
pre_batch_id, m, known_seg_num, known_count,
known_count_align);
for (int k = 0; k < unknown_count; ++k) {
cur_batch_id = (start_idx + k) / n;
if (cur_batch_id != pre_batch_id) { // if batch id of unknown data changed,
// load corresponding known batch data
pre_batch_id = cur_batch_id;
loadTransposedKnownTensor<T>((char *)nram_known, (char *)nram_dist,
(char *)known_gdram, known_num_deal,
pre_batch_id, m, known_seg_num, known_count,
known_count_align);
}
computeThreeNN(nram_unknown + 3 * k, nram_known, nram_dist, nram_dest,
nram_aux_a + k * auxa_offset, nram_aux_sort_a,
nram_aux_b + k * auxb_offset, nram_aux_sort_b,
known_num_deal, known_seg_num, *deal_offset, known_count,
known_count_align);
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelThreeNN(const int b, const int n,
const int m, char *unknown_gdram,
char *known_gdram, char *dist2_gdram,
int *idx_gdram) {
if (coreId == 0x80) {
return;
}
size_t output_aux_sort_a_gap = 0, output_aux_sort_b_gap = 0,
output_dest_gap = 0, output_unknown_gap = 0, output_known_gap = 0,
output_dist_gap = 0, auxillary_a_gap = 0, auxillary_b_gap = 0,
known_num_deal = 0, unknown_num_deal = 0, align_num = 0,
auxa_offset = 0, auxb_offset = 0;
auxFuncNN<T>(&output_aux_sort_a_gap, &output_aux_sort_b_gap, &output_dest_gap,
&output_unknown_gap, &output_known_gap, &output_dist_gap,
&auxillary_a_gap, &auxillary_b_gap, &known_num_deal,
&unknown_num_deal, &align_num, &auxa_offset, &auxb_offset);
int num_per_core = b * n / taskDim;
const int core_offset = num_per_core;
char *unknown_gdram_start =
unknown_gdram + taskId * 3 * core_offset * sizeof(T);
char *known_gdram_start = known_gdram;
char *output_dist_start = dist2_gdram + taskId * 3 * core_offset * sizeof(T);
int *output_idx_start = idx_gdram + taskId * 3 * core_offset;
const int rem = (b * n) % taskDim;
if (taskId == taskDim - 1) {
num_per_core += rem;
}
const int unknown_repeat =
num_per_core / unknown_num_deal; // if the unknown number is large, process
// it in unknown_repeat passes.
const int unknown_rem = num_per_core % unknown_num_deal; // unknown remainder
const int unknown_rem_align = PAD_UP(unknown_rem, align_num);
const int known_repeat =
m / known_num_deal; // if the known number is large, process it in
// known_repeat passes.
const int known_rem = m % known_num_deal; // known remainder
const int known_rem_align = PAD_UP(known_rem, align_num);
char *nram_aux_sort_a = nram_buffer;
int *nram_aux_sort_b = (int *)(nram_buffer + output_aux_sort_b_gap);
char *nram_dest = nram_buffer + output_dest_gap;
char *nram_unknown = nram_buffer + output_unknown_gap;
char *nram_known = nram_buffer + output_known_gap;
char *nram_dist = nram_buffer + output_dist_gap;
char *nram_aux_a = nram_buffer + auxillary_a_gap;
int *nram_aux_b = (int *)(nram_buffer + auxillary_b_gap);
int deal_offset = 0;
int start_idx = -1;
for (int j = 0; j < unknown_repeat;
++j) { // process one full unknown segment per iteration
// if the unknown data has to be processed segmentally, use the aux_a and
// aux_b space to keep the running 3 minimum distances.
__bang_write_value(nram_aux_a, unknown_num_deal * auxa_offset,
(T)(INFINITY));
__bang_write_value(nram_aux_b, unknown_num_deal * auxb_offset, (int)0);
loadUnknownTensor<T>(nram_unknown, unknown_gdram_start, unknown_num_deal, j,
unknown_num_deal, unknown_num_deal);
deal_offset = 0;
start_idx = taskId * core_offset + j * unknown_num_deal;
for (int i = 0; i < known_repeat;
++i) { // process the known data segmentally.
auxProcessSegment<T>(
m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
(T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
unknown_num_deal, known_num_deal, i, j, unknown_num_deal,
known_num_deal, known_num_deal, start_idx, &deal_offset);
deal_offset += 3;
}
if (known_rem > 0) { // process known rem
__bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY));
auxProcessSegment<T>(
m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
(T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
unknown_num_deal, known_num_deal, known_repeat, j, unknown_num_deal,
known_rem, known_rem_align, start_idx, &deal_offset);
}
deal_offset += 3;
if (deal_offset > 3) {
auxFuncSort((T *)nram_aux_a, auxa_offset, nram_aux_b, auxb_offset,
(T *)nram_dest, (T *)nram_aux_sort_a, nram_aux_sort_b,
unknown_num_deal, deal_offset);
deal_offset = 0;
}
__memcpy((char *)output_dist_start + j * unknown_num_deal * 3 * sizeof(T),
(char *)nram_aux_a, 3 * sizeof(T), NRAM2GDRAM, 3 * sizeof(T),
auxa_offset * sizeof(T), unknown_num_deal - 1);
__memcpy((char *)output_idx_start + j * unknown_num_deal * 3 * sizeof(int),
(char *)nram_aux_b, 3 * sizeof(int), NRAM2GDRAM, 3 * sizeof(int),
auxb_offset * sizeof(int), unknown_num_deal - 1);
}
if (unknown_rem > 0) { // process unknown rem
deal_offset = 0;
__bang_write_value(nram_aux_a, unknown_num_deal * auxa_offset,
(T)(INFINITY));
__bang_write_value(nram_aux_b, unknown_num_deal * auxb_offset, (int)0);
loadUnknownTensor<T>(nram_unknown, unknown_gdram_start, unknown_num_deal,
unknown_repeat, unknown_rem, unknown_rem_align);
start_idx = taskId * core_offset + unknown_repeat * unknown_num_deal;
for (int i = 0; i < known_repeat; ++i) {
auxProcessSegment<T>(
m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
(T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
unknown_num_deal, known_num_deal, i, unknown_repeat, unknown_rem,
known_num_deal, known_num_deal, start_idx, &deal_offset);
deal_offset += 3;
}
if (known_rem > 0) {
__bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY));
start_idx = taskId * core_offset + unknown_repeat * unknown_num_deal;
auxProcessSegment<T>(
m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
(T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
unknown_num_deal, known_num_deal, known_repeat, unknown_repeat,
unknown_rem, known_rem, known_rem_align, start_idx, &deal_offset);
deal_offset += 3;
}
if (deal_offset > 3) {
auxFuncSort((T *)nram_aux_a, auxa_offset, nram_aux_b, auxb_offset,
(T *)nram_dest, (T *)nram_aux_sort_a, nram_aux_sort_b,
unknown_rem, deal_offset);
deal_offset = 0;
}
__memcpy((char *)output_dist_start +
unknown_repeat * unknown_num_deal * 3 * sizeof(T),
(char *)nram_aux_a, 3 * sizeof(T), NRAM2GDRAM, 3 * sizeof(T),
auxa_offset * sizeof(T), unknown_rem - 1);
__memcpy((char *)output_idx_start +
unknown_repeat * unknown_num_deal * 3 * sizeof(int),
(char *)nram_aux_b, 3 * sizeof(int), NRAM2GDRAM, 3 * sizeof(int),
auxb_offset * sizeof(int), unknown_rem - 1);
}
}
template __mlu_global__ void MLUUnion1KernelThreeNN<float>(
const int b, const int n, const int m, char *unknown_gdram,
char *known_gdram, char *dist2_gdram, int *idx_gdram);
template __mlu_global__ void MLUUnion1KernelThreeNN<half>(
const int b, const int n, const int m, char *unknown_gdram,
char *known_gdram, char *dist2_gdram, int *idx_gdram);
void KernelThreeNNForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t data_type,
const void *unknown, const void *known, void *dist2,
int *idx, const int b, const int n, const int m) {
switch (data_type) {
case CNRT_FLOAT16: {
MLUUnion1KernelThreeNN<half><<<k_dim, k_type, queue>>>(
b, n, m, (char *)unknown, (char *)known, (char *)dist2, idx);
}; break;
case CNRT_FLOAT32: {
MLUUnion1KernelThreeNN<float><<<k_dim, k_type, queue>>>(
b, n, m, (char *)unknown, (char *)known, (char *)dist2, idx);
}; break;
default: {
break;
}
}
}
mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu deleted 100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
__nram__ char data_nram[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void mluMultiKernelTinShift(
const T *input, const int *shifts, T *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel) {
for (int cur_channel_index = taskId;
cur_channel_index < batch_size * channel_size;
cur_channel_index += taskDim) {
int n_index = cur_channel_index / channel_size;
int group_id = cur_channel_index % channel_size / group_channel;
int t_shift = shifts[n_index * group_size + group_id];
int index = cur_channel_index % channel_size * hw_size +
n_index * time_size * channel_size * hw_size;
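// index addresses frame t = 0 of this (batch, channel) pair; the strided copies
// below walk the time dimension with stride channel_size * hw_size. A positive
// t_shift moves the sequence later in time (the first t_shift frames stay
// zero); a negative t_shift moves it earlier.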
__bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;");
if (abs(t_shift) >= time_size) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
time_size - 1);
} else {
if (t_shift > 0) {
__memcpy(data_nram + t_shift * hw_size * sizeof(T), input + index,
hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T),
channel_size * hw_size * sizeof(T), time_size - 1 - t_shift);
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
time_size - 1);
} else {
__memcpy(data_nram, input + (index - t_shift * channel_size * hw_size),
hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T),
channel_size * hw_size * sizeof(T), time_size - 1 + t_shift);
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
time_size - 1);
}
}
__asm__ volatile("sync;");
}
}
template <typename T>
__mlu_func__ void mluHwSplit(const T *input, const int t_shift,
const int time_size, const int hw_size,
const int channel_size, const int index,
const int cur_sequence_index,
const int max_length_per_core, T *output) {
for (int cur_index = index; cur_index < index + hw_size;
cur_index += max_length_per_core) {
int memcpy_size = max_length_per_core;
if (cur_index + max_length_per_core > index + hw_size) {
memcpy_size = index + hw_size - cur_index;
}
if (cur_sequence_index - t_shift < 0 ||
cur_sequence_index - t_shift >= time_size) {
__memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T),
NRAM2GDRAM);
} else {
__memcpy(data_nram, input + cur_index - t_shift * channel_size * hw_size,
memcpy_size * sizeof(T), GDRAM2NRAM);
__memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T),
NRAM2GDRAM);
}
__asm__ volatile("sync;");
}
}
template <typename T>
__mlu_func__ void mluMultiKernelTinShiftSplitSequence(
const T *input, const int *shifts, T *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const int max_number_hw_per_core, const int max_length_per_core) {
const int tmp_max_number_hw_per_core =
max_number_hw_per_core > 0 ? max_number_hw_per_core : 1;
const int loop_time = time_size / tmp_max_number_hw_per_core +
((time_size % tmp_max_number_hw_per_core) > 0 ? 1 : 0);
int segmentime_size = tmp_max_number_hw_per_core;
int res_segment = time_size % tmp_max_number_hw_per_core;
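// Each (batch, channel) sequence is processed in loop_time segments of
// segmentime_size frames; res_segment is the number of frames left in the
// last, possibly shorter, segment.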
for (int cur_segment_index = taskId;
cur_segment_index < loop_time * batch_size * channel_size;
cur_segment_index += taskDim) {
int n_index = cur_segment_index / loop_time / channel_size;
int group_id = cur_segment_index / loop_time % channel_size / group_channel;
int t_shift = shifts[n_index * group_size + group_id];
int index = n_index * time_size * channel_size * hw_size +
(cur_segment_index / loop_time % channel_size) * hw_size +
cur_segment_index % loop_time * segmentime_size * hw_size *
channel_size;
char *dst_gdram2nram = data_nram;
const T *src_gdram2nram = input + index;
int count_gdram2nram = -1;
int count_nram2gdram = -1;
int next_sequence_index =
index / hw_size / channel_size % time_size + segmentime_size;
int cur_sequence_index = index / hw_size / channel_size % time_size;
__bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;");
if (max_number_hw_per_core == 0) {
mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index,
cur_sequence_index, max_length_per_core, output);
continue;
}
if (abs(t_shift) >= time_size) {
if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
res_segment - 1);
} else {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
segmentime_size - 1);
}
continue;
}
if (t_shift == 0) {
if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
dst_gdram2nram = data_nram;
src_gdram2nram = input + index;
count_gdram2nram = res_segment - 1;
count_nram2gdram = res_segment - 1;
} else {
dst_gdram2nram = data_nram;
src_gdram2nram = input + index;
count_gdram2nram = segmentime_size - 1;
count_nram2gdram = segmentime_size - 1;
}
} else if (t_shift > 0) {
int first_index_cur_channel =
n_index * time_size * channel_size * hw_size +
(cur_segment_index / loop_time % channel_size) * hw_size;
if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
dst_gdram2nram = data_nram;
src_gdram2nram =
input +
(index - t_shift * channel_size * hw_size < first_index_cur_channel
? first_index_cur_channel
: index - t_shift * channel_size * hw_size);
count_gdram2nram = res_segment - 1;
count_nram2gdram = res_segment - 1;
if (cur_sequence_index < t_shift && t_shift < next_sequence_index) {
dst_gdram2nram =
data_nram + t_shift % segmentime_size * hw_size * sizeof(T);
count_gdram2nram = res_segment - (t_shift - cur_sequence_index) - 1;
}
} else {
if (t_shift >= next_sequence_index) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
segmentime_size - 1);
continue;
} else if (cur_sequence_index < t_shift &&
t_shift < next_sequence_index) {
dst_gdram2nram =
data_nram + t_shift % segmentime_size * hw_size * sizeof(T);
src_gdram2nram = input + first_index_cur_channel;
count_gdram2nram = segmentime_size - (t_shift % segmentime_size) - 1;
count_nram2gdram = segmentime_size - 1;
} else {
dst_gdram2nram = data_nram;
src_gdram2nram = input + index - t_shift * channel_size * hw_size;
count_gdram2nram = segmentime_size - 1;
count_nram2gdram = segmentime_size - 1;
}
}
} else {
int offset_index = time_size + t_shift;
if (cur_sequence_index >= offset_index) {
if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
res_segment - 1);
continue;
} else {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
segmentime_size - 1);
continue;
}
} else {
dst_gdram2nram = data_nram;
src_gdram2nram = input + index - t_shift * channel_size * hw_size;
if (cur_sequence_index - t_shift + segmentime_size < time_size) {
count_gdram2nram = segmentime_size - 1;
count_nram2gdram = segmentime_size - 1;
} else {
count_gdram2nram = time_size - (cur_sequence_index - t_shift) - 1;
count_nram2gdram =
(segmentime_size - 1) < (time_size - cur_sequence_index - 1)
? (segmentime_size - 1)
: (time_size - cur_sequence_index - 1);
}
}
}
__memcpy(dst_gdram2nram, src_gdram2nram, hw_size * sizeof(T), GDRAM2NRAM,
hw_size * sizeof(T), channel_size * hw_size * sizeof(T),
count_gdram2nram);
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
count_nram2gdram);
__asm__ volatile("sync;");
}
}
__mlu_entry__ void MLUUnion1KernelTinShift(
const void *input, const void *shifts, void *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const cnrtDataType_t data_dtype) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (data_dtype) {
case CNRT_FLOAT16: {
mluMultiKernelTinShift((half *)input, (const int *)shifts, (half *)output,
batch_size, time_size, channel_size, hw_size,
group_size, group_channel);
}; break;
case CNRT_FLOAT32: {
mluMultiKernelTinShift((float *)input, (const int *)shifts,
(float *)output, batch_size, time_size,
channel_size, hw_size, group_size, group_channel);
}; break;
default: { return; }
}
}
__mlu_entry__ void MLUUnion1KernelTinShiftSplitSequence(
const void *input, const void *shifts, void *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const int max_number_hw_per_core, const int max_length_per_core,
const cnrtDataType_t data_dtype) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (data_dtype) {
case CNRT_FLOAT16: {
mluMultiKernelTinShiftSplitSequence(
(half *)input, (const int *)shifts, (half *)output, batch_size,
time_size, channel_size, hw_size, group_size, group_channel,
max_number_hw_per_core, max_length_per_core);
}; break;
case CNRT_FLOAT32: {
mluMultiKernelTinShiftSplitSequence(
(float *)input, (const int *)shifts, (float *)output, batch_size,
time_size, channel_size, hw_size, group_size, group_channel,
max_number_hw_per_core, max_length_per_core);
}; break;
default: { return; }
}
}
void KernelTinShiftForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *input, const void *shifts, void *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const cnrtDataType_t data_dtype, const int channel_per_core,
const int max_number_hw_per_core, const int max_length_per_core) {
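// Presumably channel_per_core >= 1 means every core can be given at least one
// whole (batch, channel) slice, so the simple per-channel kernel is used;
// otherwise a single channel's time sequence is split across cores by the
// SplitSequence variant.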
if (channel_per_core >= 1) {
MLUUnion1KernelTinShift<<<k_dim, k_type, queue>>>(
input, shifts, output, batch_size, time_size, channel_size, hw_size,
group_size, group_channel, data_dtype);
} else {
MLUUnion1KernelTinShiftSplitSequence<<<k_dim, k_type, queue>>>(
input, shifts, output, batch_size, time_size, channel_size, hw_size,
group_size, group_channel, max_number_hw_per_core, max_length_per_core,
data_dtype);
}
}
void KernelTinShiftBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *grad_output, const void *shifts, void *grad_input,
const int batch_size, const int time_size, const int channel_size,
const int hw_size, const int group_size, const int group_channel,
const cnrtDataType_t data_dtype, const int channel_per_core,
const int max_number_hw_per_core, const int max_length_per_core) {
if (channel_per_core >= 1) {
MLUUnion1KernelTinShift<<<k_dim, k_type, queue>>>(
grad_output, shifts, grad_input, batch_size, time_size, channel_size,
hw_size, group_size, group_channel, data_dtype);
} else {
MLUUnion1KernelTinShiftSplitSequence<<<k_dim, k_type, queue>>>(
grad_output, shifts, grad_input, batch_size, time_size, channel_size,
hw_size, group_size, group_channel, max_number_hw_per_core,
max_length_per_core, data_dtype);
}
}
mmcv/ops/csrc/common/mps/MPSDevice.h deleted 100644 → 0
// Copyright © 2022 Apple Inc.
// This file is modified from:
// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSDevice.h
#pragma once
#include <ATen/ATen.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLDevice> MTLDevice_t;
#else
typedef void* MTLDevice;
typedef void* MTLDevice_t;
#endif

using namespace std;

namespace at {
namespace mps {

//-----------------------------------------------------------------
//  MPSDevice
//
//  MPSDevice is a singleton class that returns the default device
//-----------------------------------------------------------------
class TORCH_API MPSDevice {
 public:
  /**
   * MPSDevice should not be cloneable.
   */
  MPSDevice(MPSDevice& other) = delete;
  /**
   * MPSDevice should not be assignable.
   */
  void operator=(const MPSDevice&) = delete;
  /**
   * Gets single instance of the Device.
   */
  static MPSDevice* getInstance();
  /**
   * Returns the single device.
   */
  MTLDevice_t device() { return _mtl_device; }

  ~MPSDevice();

 private:
  static MPSDevice* _device;
  MTLDevice_t _mtl_device;
  MPSDevice();
};

TORCH_API bool is_available();
TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);

}  // namespace mps
}  // namespace at
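A short sketch of how this singleton is meant to be consumed, assuming the header above is included:

// Sketch: guard Metal work behind the availability check, then fetch the
// shared MTLDevice that backs every library and pipeline created below.
void mps_device_example() {
  if (at::mps::is_available()) {
    MTLDevice_t dev = at::mps::MPSDevice::getInstance()->device();
    (void)dev;  // handed to library/pipeline creation as in MPSLibrary.mm
  }
}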
mmcv/ops/csrc/common/mps/MPSLibrary.h
deleted
100644 → 0
View file @
6f674c7e
#ifndef _MPS_LIBRARY_H_
#define _MPS_LIBRARY_H_
#include <string>
#include <unordered_map>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLComputePipelineState> MTLComputePipelineState_t;
typedef id<MTLLibrary> MTLLibrary_t;
#else
typedef void* MTLComputePipelineState;
typedef void* MTLComputePipelineState_t;
typedef void* MTLLibrary;
typedef void* MTLLibrary_t;
#endif

class MPSLibrary {
 public:
  // disable constructor for singleton
  static MPSLibrary* createFromUrl(const std::string& library_url);
  static MPSLibrary* createFromSource(const std::string& source);
  ~MPSLibrary();

  MTLLibrary_t library() { return _library; }

  MTLComputePipelineState_t getComputePipelineState(
      const std::string& function_name);

 private:
  MTLLibrary_t _library;
  std::unordered_map<std::string, MTLComputePipelineState_t> _pso_map;
};

class MPSLibraryManager {
 public:
  // disable constructor for singleton
  MPSLibraryManager(const MPSLibraryManager&) = delete;
  MPSLibraryManager& operator=(const MPSLibraryManager&) = delete;
  MPSLibraryManager(MPSLibraryManager&&) = delete;
  MPSLibraryManager& operator=(MPSLibraryManager&&) = delete;

  static MPSLibraryManager* getInstance();

  bool hasLibrary(const std::string& name);

  MPSLibrary* getLibrary(const std::string& library_url);

  MPSLibrary* createLibraryFromSouce(const std::string& name,
                                     const std::string& sources);

  ~MPSLibraryManager();

 private:
  MPSLibraryManager();
  std::unordered_map<std::string, std::unique_ptr<MPSLibrary>> _library_map;
};

#endif
mmcv/ops/csrc/common/mps/MPSLibrary.mm
deleted
100644 → 0
View file @
6f674c7e
#include "MPSLibrary.h"
#include "MPSDevice.h"
static std::unique_ptr<MPSLibraryManager> mps_library_manager = nullptr;

MPSLibraryManager* MPSLibraryManager::getInstance() {
  if (!mps_library_manager)
    mps_library_manager =
        std::unique_ptr<MPSLibraryManager>(new MPSLibraryManager());
  return mps_library_manager.get();
}

MPSLibraryManager::~MPSLibraryManager() {}

MPSLibraryManager::MPSLibraryManager() {}

bool MPSLibraryManager::hasLibrary(const std::string& name) {
  return _library_map.find(name) != _library_map.end();
}

MPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) {
  if (_library_map.find(library_url) != _library_map.end()) {
    return _library_map[library_url].get();
  }
  _library_map.emplace(std::make_pair(
      library_url,
      std::unique_ptr<MPSLibrary>(MPSLibrary::createFromUrl(library_url))));
  return _library_map[library_url].get();
}

MPSLibrary* MPSLibraryManager::createLibraryFromSouce(
    const std::string& name, const std::string& source) {
  NSString* ns_name = [NSString stringWithCString:name.c_str()];
  if (_library_map.find(name) != _library_map.end()) {
    NSLog(@"Library %@ already exist.", ns_name);
    return nullptr;
  }
  _library_map.emplace(std::make_pair(
      name,
      std::unique_ptr<MPSLibrary>(MPSLibrary::createFromSource(source))));
  return _library_map[name].get();
}

MPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) {
  MPSLibrary* library = new MPSLibrary();
  @autoreleasepool {
    NSError* error = nil;

    // load library and func
    NSString* utl_str = [NSString stringWithCString:library_url.c_str()];
    NSURL* metal_url = [NSURL fileURLWithPath:utl_str];
    library->_library = [at::mps::MPSDevice::getInstance()->device()
        newLibraryWithURL:metal_url
                    error:&error];
    if (library->_library == nil) {
      NSLog(@"Failed to find library, error %@.", error);
      exit(1);
    }
  }

  return library;
}

MPSLibrary* MPSLibrary::createFromSource(const std::string& sources) {
  MPSLibrary* library = new MPSLibrary();
  @autoreleasepool {
    NSError* error = nil;

    // load library and func
    NSString* code_str = [NSString stringWithCString:sources.c_str()];
    library->_library = [at::mps::MPSDevice::getInstance()->device()
        newLibraryWithSource:code_str
                     options:nil
                       error:&error];
    if (library->_library == nil) {
      NSLog(@"Failed to find library, error %@.", error);
      exit(1);
    }
  }

  return library;
}

MPSLibrary::~MPSLibrary() {
  [_library release];
  _library = nil;
}

MTLComputePipelineState_t MPSLibrary::getComputePipelineState(
    const std::string& function_name) {
  if (_pso_map.find(function_name) != _pso_map.end()) {
    return _pso_map[function_name];
  }

  MTLComputePipelineState_t pso;
  @autoreleasepool {
    NSError* error = nil;

    // create function
    NSString* function_name_str =
        [NSString stringWithCString:function_name.c_str()];
    id<MTLFunction> func = [_library newFunctionWithName:function_name_str];
    if (func == nil) {
      NSLog(@"Failed to created pipeline state object, error %@.", error);
      exit(1);
    }
    // create pipeline
    pso = [at::mps::MPSDevice::getInstance()->device()
        newComputePipelineStateWithFunction:func
                                      error:&error];
    _pso_map.emplace(std::make_pair(function_name, pso));
  }
  return _pso_map[function_name];
}
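The manager caches one MPSLibrary per name/URL, and each library caches one pipeline state per function, so repeated lookups are cheap. A minimal sketch of compiling a kernel from source and fetching its pipeline state; the kernel name and Metal source below are hypothetical and only illustrate the caching flow:

// Hypothetical Metal source; only the library/pipeline caching flow matters.
static const std::string kFillOneSource = R"METAL(
kernel void fill_one(device float* out [[buffer(0)]],
                     uint idx [[thread_position_in_grid]]) {
  out[idx] = 1.0f;
}
)METAL";

MTLComputePipelineState_t get_fill_one_pipeline() {
  MPSLibraryManager* manager = MPSLibraryManager::getInstance();
  MPSLibrary* lib =
      manager->hasLibrary("fill_one")
          ? manager->getLibrary("fill_one")
          : manager->createLibraryFromSouce("fill_one", kFillOneSource);
  return lib->getComputePipelineState("fill_one");
}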
mmcv/ops/csrc/common/mps/MPSStream.h
deleted
100644 → 0
View file @
6f674c7e
// Copyright © 2022 Apple Inc.
// This file is modified from:
// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h
#pragma once
#include <cstdint>
#include <utility>
#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/util/Exception.h>
#include "MPSDevice.h"
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
typedef id<MTLCommandQueue> MTLCommandQueue_t;
typedef id<MTLCommandBuffer> MTLCommandBuffer_t;
typedef id<MTLSharedEvent> MTLSharedEvent_t;
typedef id<MTLDevice> MTLDevice_t;
#else
typedef void* MTLCommandQueue_t;
typedef void* MTLCommandQueue;
typedef void* MTLCommandBuffer_t;
typedef void* MTLCommandBuffer;
typedef void* MTLSharedEvent_t;
typedef void* dispatch_queue_t;
typedef void* MTLDevice_t;
#define nil NULL;
#endif

namespace at {
namespace mps {

//-----------------------------------------------------------------
//  MPSStream
//-----------------------------------------------------------------
class TORCH_API MPSStream {
 public:
  enum Unchecked { UNCHECKED };
  /// Construct a MPSStream from a Stream.  This construction is checked,
  /// and will raise an error if the Stream is not, in fact, a MPS stream.
  explicit MPSStream(Stream stream);

  ~MPSStream();
  MTLCommandQueue_t commandQueue() const { return _commandQueue; };
  dispatch_queue_t queue() const { return _serialQueue; }

  MTLCommandBuffer_t commandBuffer();
  void commit(bool flush);
  void commitAndWait();
  void synchronize();

  void flush();

  /// Get the MPS device index that this stream is associated with.
  c10::DeviceIndex device_index() const { return _stream.device_index(); }

  MTLCommandQueue_t stream() const { return _commandQueue; };

  MTLDevice_t device() const { return [_commandQueue device]; }

  /// Explicit conversion to Stream.
  Stream unwrap() const { return _stream; }

 private:
  Stream _stream;
  MTLCommandQueue_t _commandQueue = nil;
  MTLCommandBuffer_t _commandBuffer = nil;
  void _flush(bool commitAndWait) const;

  dispatch_queue_t _serialQueue = nullptr;
};

/**
 * Get the current MPS stream
 */
TORCH_API MPSStream* getCurrentMPSStream();

/**
 * Get the default MPS stream
 */
TORCH_API MPSStream* getDefaultMPSStream();

//-----------------------------------------------------------------
//  MPSStreamImpl
//-----------------------------------------------------------------
class TORCH_API MPSStreamImpl {
 public:
  /**
   * Gets single instance of the MPSStream.
   */
  static MPSStream* getInstance();

 private:
  static MPSStream* _stream;
  MPSStreamImpl();
};

//-----------------------------------------------------------------
//  MPSEvent
//-----------------------------------------------------------------
struct TORCH_API MPSEvent {
  MPSEvent();
  // MPSEvent(id<MTLDevice> device);

  ~MPSEvent();
  MTLSharedEvent_t event() const { return _event; }

  void recordEvent(MPSStream* stream);
  void waitForEvent(MPSStream* queue);  // waits on the cpu
  bool queryEvent();
  uint64_t getCurrentValue() { return _currentValue; }
  void setCurrentValue(uint64_t currValue) { _currentValue = currValue; }

 private:
  bool _isRecorded = false;
  uint64_t _currentValue = 0;
  MTLSharedEvent_t _event;
};

typedef MPSEvent* mpsEvent_t;

}  // namespace mps
}  // namespace at
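A brief sketch of the intended stream lifecycle, assuming compute work is encoded elsewhere against the stream's command buffer:

// Sketch: obtain the current stream, encode onto its (lazily created)
// command buffer, then commit and block until the GPU has finished.
void mps_stream_example() {
  at::mps::MPSStream* stream = at::mps::getCurrentMPSStream();
  MTLCommandBuffer_t cmd_buf = stream->commandBuffer();
  (void)cmd_buf;  // ... create a compute encoder from cmd_buf and dispatch ...
  stream->commitAndWait();
}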
mmcv/ops/csrc/common/mps/MPSUtils.h
deleted
100644 → 0
View file @
6f674c7e
#ifndef _MPS_UTILS_H_
#define _MPS_UTILS_H_
#include <torch/extension.h>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLBuffer> MTLBuffer_t;
typedef id<MTLComputeCommandEncoder> MTLComputeCommandEncoder_t;
#else
typedef void* MTLBuffer;
typedef void* MTLBuffer_t;
typedef void* MTLComputeCommandEncoder;
typedef void* MTLComputeCommandEncoder_t;
#endif

// utils
static inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) {
  return __builtin_bit_cast(MTLBuffer_t, tensor.storage().data());
}

template <typename T,
          std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value,
                           bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t);

template <typename T,
          std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value,
                           bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
  [encoder setBuffer:getMTLBufferStorage(t) offset:0 atIndex:index];
}

template <typename T,
          std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value,
                           bool>>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
  [encoder setBytes:&t length:sizeof(t) atIndex:index];
}

inline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {}

template <typename T, typename... Args>
void setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t,
                    Args&&... args) {
  setMTLArg(encoder, index, std::forward<T>(t));
  setMTLArgsImpl(encoder, index + 1, std::forward<Args>(args)...);
}

template <typename... Args>
void setMTLArgs(MTLComputeCommandEncoder_t encoder,
                MTLComputePipelineState_t pso, Args&&... args) {
  [encoder setComputePipelineState:pso];
  setMTLArgsImpl(encoder, 0, std::forward<Args>(args)...);
}
#endif
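setMTLArgs binds the pipeline state and then walks the argument pack: at::Tensor arguments are bound with setBuffer (via getMTLBufferStorage), everything else with setBytes, and buffer indices are assigned in order starting at 0. A minimal sketch, assuming the encoder comes from the stream's command buffer and the pipeline from MPSLibrary::getComputePipelineState:

// Sketch: input/output tensors land at buffer indices 0 and 1, the scalar
// count at index 2, matching the recursion in setMTLArgsImpl.
void encode_example(MTLComputeCommandEncoder_t encoder,
                    MTLComputePipelineState_t pso, const at::Tensor& input,
                    const at::Tensor& output, uint32_t count) {
  setMTLArgs(encoder, pso, input, output, count);
}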
mmcv/ops/csrc/common/pytorch_cpp_helper.hpp
View file @
6f3c5f1c
#ifndef PYTORCH_CPP_HELPER
#define PYTORCH_CPP_HELPER
#include <torch/types.h>
#include <torch/extension.h>

#include <vector>

using namespace at;
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_MLU(x) \
TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor")
#define CHECK_CPU(x) \
  TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor")
  TORCH_CHECK(!x.device().is_cuda(), #x " must be a CPU tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_CUDA_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
#define CHECK_MLU_INPUT(x) \
CHECK_MLU(x); \
CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) \
CHECK_CPU(x); \
CHECK_CONTIGUOUS(x)
...
...
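The CHECK_*_INPUT macros are typically the first lines of an op binding. A hypothetical MLU binding might look like the sketch below; the function name is illustrative and not part of this diff:

// Illustrative only: validate device placement and contiguity before
// computing the launch policy and calling the kernel launcher.
void tin_shift_forward_mlu_example(const Tensor& input, const Tensor& shift,
                                   Tensor& output) {
  CHECK_MLU_INPUT(input);
  CHECK_MLU_INPUT(shift);
  CHECK_MLU_INPUT(output);
  // ... derive k_dim/k_type and call KernelTinShiftForward ...
}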
mmcv/ops/csrc/common/pytorch_cuda_helper.hpp
View file @
6f3c5f1c
...
...
@@ -15,6 +15,5 @@ using at::Tensor;
using phalf = at::Half;
#define __PHALF(x) (x)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#endif // PYTORCH_CUDA_HELPER