Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
MMCV
Commits
8b8bf5e1
Unverified
Commit
8b8bf5e1
authored
Aug 28, 2023
by
Chris Jiang
Committed by
GitHub
Aug 28, 2023
Browse files
[Refactor] Replace roipoint_pool3d op of MLU backend with mlu-ops (#2875)
parent
099ee24d
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
42 additions
and
1142 deletions
+42
-1142
mmcv/ops/csrc/common/mlu/roipoint_pool3d_large_boxes_num_mlu_kernel.mlu
...common/mlu/roipoint_pool3d_large_boxes_num_mlu_kernel.mlu
+0
-536
mmcv/ops/csrc/common/mlu/roipoint_pool3d_mlu_kernel.mlu
mmcv/ops/csrc/common/mlu/roipoint_pool3d_mlu_kernel.mlu
+0
-544
mmcv/ops/csrc/pytorch/mlu/roipoint_pool3d_mlu.cpp
mmcv/ops/csrc/pytorch/mlu/roipoint_pool3d_mlu.cpp
+42
-62
No files found.
mmcv/ops/csrc/common/mlu/roipoint_pool3d_large_boxes_num_mlu_kernel.mlu
deleted
100644 → 0
View file @
099ee24d
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* OR IMPLIED, INCLUDING BUvoid NOKType LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENvoid SHALL THE AUTHORS OR COPYRIGHKType HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORvoid OR OTHERWISE, ARISING FROM, OUKType OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
/*************************************************************************
*
* NRAM partition:
* | boxes3d | ping points + pong points | aux_a ~ aux_f |
* | 7 * sizeof(T) | 6 * deal_num * sizeof(T) | 6 * deal_num * sizeof(T) |
*
*************************************************************************/
#define TWELVE_SPLIT 12
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void checkPointsInBox3d(const T *boxes3d,
const size_t deal_num,
T *x,
T *y,
T *z,
T *auxiliary_a,
T *auxiliary_b,
T *auxiliary_c,
T *auxiliary_d,
T *auxiliary_e,
T *auxiliary_f,
T *pts_assign) {
// param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate
T cx = boxes3d[0];
T cy = boxes3d[1];
T cz = boxes3d[2];
T dx = boxes3d[3];
T dy = boxes3d[4];
T dz = boxes3d[5];
T rz = boxes3d[6];
// shift to the center since cz in box3d is the bottom center
cz += 0.5 * dz;
T cosa = (T)std::cos(-rz);
T sina = (T)std::sin(-rz);
// x - cx
__bang_sub_scalar((T *)auxiliary_a, (T *)x, (T)cx, deal_num);
// y - cy
__bang_sub_scalar((T *)auxiliary_b, (T *)y, (T)cy, deal_num);
// z - cz
__bang_sub_scalar((T *)auxiliary_c, (T *)z, (T)cz, deal_num);
// |z - cz|
__bang_active_abs((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
// |z - cz| > dz / 2.0
#if __BANG_ARCH__ >= 322
__bang_gt_scalar((T *)auxiliary_c, (T *)auxiliary_c, (T)(0.5 * dz), deal_num);
#else
__bang_write_value((T *)auxiliary_d, deal_num, (T)(0.5 * dz));
__bang_lt((T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_c, deal_num);
#endif
// !(|z - cz| > dz / 2.0)
__bang_not((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
// (x - cx) * cos(-rz)
__bang_mul_scalar((T *)auxiliary_d, (T *)auxiliary_a, (T)cosa, deal_num);
// (y - cy) * sin(-rz)
__bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_b, (T)sina, deal_num);
// local_x = (x - cx) * cos(-rz) + (y - cy) * -sin(-rz)
__bang_sub((T *)auxiliary_d, (T *)auxiliary_d, (T *)auxiliary_e, deal_num);
// |local_x|
__bang_active_abs((T *)auxiliary_d, (T *)auxiliary_d, deal_num);
// |local_x| < dx / 2.0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar(auxiliary_d, auxiliary_d, (T)(0.5 * dx), deal_num);
#else
__bang_write_value((T *)auxiliary_e, deal_num, (T)(0.5 * dx));
__bang_gt((T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_d, deal_num);
#endif
// (x - cx) * sin(-rz)
__bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_a, (T)sina, deal_num);
// (y - cy) * cos(-rz)
__bang_mul_scalar((T *)auxiliary_f, (T *)auxiliary_b, (T)cosa, deal_num);
// local_y = (x - cx) * sin(-rz) + (y - cy) * cos(-rz)
__bang_add((T *)auxiliary_e, (T *)auxiliary_e, (T *)auxiliary_f, deal_num);
// |local_y|
__bang_active_abs((T *)auxiliary_e, (T *)auxiliary_e, deal_num);
// |local_y| < dy / 2.0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar(auxiliary_e, auxiliary_e, (T)(0.5 * dy), deal_num);
#else
__bang_write_value((T *)auxiliary_f, deal_num, (T)(0.5 * dy));
__bang_gt((T *)auxiliary_e, (T *)auxiliary_f, (T *)auxiliary_e, deal_num);
#endif
// pts_assign = |x - cx| < dx / 2.0 && |y - cy| < dy / 2.0 && |z - cz| <= dz / 2.0
__bang_mul((T *)pts_assign, (T *)auxiliary_c, (T *)auxiliary_d, deal_num);
__bang_mul((T *)pts_assign, (T *)pts_assign, (T *)auxiliary_e, deal_num);
}
template <typename T>
__mlu_func__ void computeStoreRoipointPool3d(char *boxes3d,
int *cnt,
char *points_x,
char *points_y,
char *points_z,
const char *point_features,
char *auxiliary_a,
char *auxiliary_b,
char *auxiliary_c,
char *auxiliary_d,
char *auxiliary_e,
char *auxiliary_f,
const int box_idx,
const int pts_num,
const int feature_in_len,
const int sampled_pts_num,
const size_t span_num_deal,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
char *pts_assign = auxiliary_a;
if (*cnt >= sampled_pts_num) {
return;
}
checkPointsInBox3d((T *)boxes3d, span_num_deal, (T *)points_x, (T *)points_y, (T *)points_z,
(T *)auxiliary_a, (T *)auxiliary_b, (T *)auxiliary_c, (T *)auxiliary_d,
(T *)auxiliary_e, (T *)auxiliary_f, (T *)pts_assign);
// __bang_select returns selected elements vector and the number of selected elements
__bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
uint32_t select_num = *((uint32_t *)auxiliary_b);
if (select_num == 0) {
return;
}
int sampled_pts_num_rem = sampled_pts_num - *cnt;
int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
// copy x to pooled_features_gdram
// The result of __bang_select is composed of three parts:
// The first 4-byte is the number of selected element, whose data type is unsigned int.
// The next 124-byte is zero. The rest bytes are the selected elements.
int select_num_size = 128;
__memcpy(
pooled_features_gdram + (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T),
(T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
1 * sizeof(T),
(T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
2 * sizeof(T),
(T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy features to pooled_features_gdram
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
(3 + c_idx) * sizeof(T),
auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
}
*cnt += select_num;
}
template <typename T>
__mlu_func__ void computeStoreLastBlockRoipointPool3d(char *boxes3d,
int *cnt,
char *points_x,
char *points_y,
char *points_z,
const char *point_features,
char *auxiliary_a,
char *auxiliary_b,
char *auxiliary_c,
char *auxiliary_d,
char *auxiliary_e,
char *auxiliary_f,
const int box_idx,
const int pts_num,
const int feature_in_len,
const int sampled_pts_num,
const size_t span_num_deal,
const size_t auxiliary_num_deal,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
char *pts_assign = auxiliary_a;
if (*cnt >= sampled_pts_num) {
// pooled_empty_flag_gdram set 0
*((int *)auxiliary_a) = 0;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
return;
}
checkPointsInBox3d((T *)boxes3d, span_num_deal, (T *)points_x, (T *)points_y, (T *)points_z,
(T *)auxiliary_a, (T *)auxiliary_b, (T *)auxiliary_c, (T *)auxiliary_d,
(T *)auxiliary_e, (T *)auxiliary_f, (T *)pts_assign);
// __bang_select returns selected elements vector and the number of selected elements
__bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
uint32_t select_num = *((uint32_t *)auxiliary_b);
if (*cnt + select_num == 0) {
// pooled_empty_flag_gdram set 1
*((int *)auxiliary_a) = 1;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
// pooled_features_gdram set 0
int repeat = (sampled_pts_num * (3 + feature_in_len)) / (auxiliary_num_deal * 6);
int rem = (sampled_pts_num * (3 + feature_in_len)) % (auxiliary_num_deal * 6);
// use auxiliary_a to auxiliary_f
__bang_write_zero((T *)auxiliary_a, PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE));
if (repeat > 0) {
__memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
auxiliary_a, auxiliary_num_deal * 6 * sizeof(T), NRAM2GDRAM,
auxiliary_num_deal * 6 * sizeof(T), 0, repeat - 1);
}
if (rem > 0) {
__memcpy(pooled_features_gdram +
box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T) +
repeat * auxiliary_num_deal * 6 * sizeof(T),
auxiliary_a, rem * sizeof(T), NRAM2GDRAM);
}
return;
}
if (select_num > 0) {
int sampled_pts_num_rem = sampled_pts_num - *cnt;
int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
// copy x to pooled_features_gdram
// The result of __bang_select is composed of three parts:
// The first 4-byte is the number of selected element, whose data type is unsigned int.
// The next 124-byte is zero. The rest bytes are the selected elements.
int select_num_size = 128;
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T),
(T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
1 * sizeof(T),
(T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
2 * sizeof(T),
(T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy features to pooled_features_gdram
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
(3 + c_idx) * sizeof(T),
auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
}
}
// pooled_empty_flag_gdram set 0
*((int *)auxiliary_a) = 0;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
*cnt += select_num;
if (*cnt < sampled_pts_num) {
// duplicate same points for sampling
int repeat = sampled_pts_num / (*cnt) - 1;
int rem = sampled_pts_num % (*cnt);
if (repeat > 0) {
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T),
pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
(*cnt) * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM,
(*cnt) * (3 + feature_in_len) * sizeof(T), 0, repeat - 1);
}
if (rem > 0) {
__memcpy(
pooled_features_gdram +
(box_idx * sampled_pts_num + (repeat + 1) * (*cnt)) * (3 + feature_in_len) *
sizeof(T),
pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
rem * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM);
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
if (coreId == 0x80) {
return;
}
size_t boxes_per_core = (batch_size * boxes_num) / taskDim;
size_t boxes_rem = (batch_size * boxes_num) % taskDim;
// calc batch_start, batch_end, first_batch_box_start, last batch_box_end for each core
int32_t batch_start = taskId < (boxes_rem + 1) ?
(taskId * (boxes_per_core + 1)) / boxes_num :
(taskId * boxes_per_core + boxes_rem) / boxes_num;
int32_t batch_end = taskId < boxes_rem ?
((taskId + 1) * (boxes_per_core + 1) - 1) / boxes_num :
((taskId + 1) * boxes_per_core + boxes_rem - 1) / boxes_num;
size_t first_batch_box_start = taskId < (boxes_rem + 1) ?
(taskId * (boxes_per_core + 1)) - batch_start * boxes_num :
taskId * boxes_per_core + boxes_rem - batch_start * boxes_num;
size_t last_batch_box_end = taskId < boxes_rem ?
(taskId + 1) * (boxes_per_core + 1) - batch_end * boxes_num :
((taskId + 1) * boxes_per_core + boxes_rem) - batch_end * boxes_num;
// points_xyz : [3, B, N]
const char *points_x_gdram = points_xyz_gdram;
const char *points_y_gdram = points_xyz_gdram + (1 * batch_size * pts_num) * sizeof(T);
const char *points_z_gdram = points_xyz_gdram + (2 * batch_size * pts_num) * sizeof(T);
size_t boxes3d_size = PAD_UP(7, NFU_ALIGN_SIZE) * sizeof(T);
size_t span_num_deal = PAD_DOWN(MAX_NRAM_SIZE / TWELVE_SPLIT / sizeof(T), NFU_ALIGN_SIZE);
size_t align_num = NFU_ALIGN_SIZE;
int32_t repeat = pts_num / span_num_deal;
size_t rem = pts_num % span_num_deal;
size_t align_rem = CEIL_ALIGN(rem, align_num);
char *boxes3d = nram_buffer;
char *ping_points_x = nram_buffer + boxes3d_size;
char *ping_points_y = ping_points_x + span_num_deal * sizeof(T);
char *ping_points_z = ping_points_y + span_num_deal * sizeof(T);
size_t ping_pong_gap = 3 * span_num_deal * sizeof(T);
char *auxiliary_a = ping_points_x + 2 * ping_pong_gap;
char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T);
char *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T);
char *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T);
char *auxiliary_e = auxiliary_d + span_num_deal * sizeof(T);
char *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T);
size_t span_load_input1_size = span_num_deal * sizeof(T);
size_t span_load_input2_size = span_num_deal * sizeof(T);
size_t span_load_input3_size = span_num_deal * sizeof(T);
size_t span_load_input4_size = span_num_deal * sizeof(T);
int cnt = 0;
for (int bs_idx = batch_start; bs_idx <= batch_end; bs_idx++) {
const char *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T);
const char *points_y_start = points_y_gdram + bs_idx * pts_num * sizeof(T);
const char *points_z_start = points_z_gdram + bs_idx * pts_num * sizeof(T);
const char *point_features_start =
point_features_gdram + bs_idx * feature_in_len * pts_num * sizeof(T);
char *pooled_features_start =
pooled_features_gdram +
(bs_idx * boxes_num * sampled_pts_num * (3 + feature_in_len)) * sizeof(T);
char *pooled_empty_flag_start = pooled_empty_flag_gdram + bs_idx * boxes_num * sizeof(int);
size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0;
size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num;
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
__memcpy_async(boxes3d,
boxes3d_gdram + bs_idx * boxes_num * 7 * sizeof(T) + box_idx * 7 * sizeof(T),
7 * sizeof(T), GDRAM2NRAM);
cnt = 0;
if (repeat > 0) {
__memcpy_async(ping_points_x, points_x_start, span_load_input1_size, GDRAM2NRAM);
__memcpy_async(ping_points_y, points_y_start, span_load_input2_size, GDRAM2NRAM);
__memcpy_async(ping_points_z, points_z_start, span_load_input3_size, GDRAM2NRAM);
__asm__ volatile("sync;");
}
for (int i = 0; i < repeat - 1; i++) {
__memcpy_async(ping_points_x + ((i + 1) % 2) * ping_pong_gap,
points_x_start + (i + 1) * span_load_input1_size, span_load_input1_size,
GDRAM2NRAM);
__memcpy_async(ping_points_y + ((i + 1) % 2) * ping_pong_gap,
points_y_start + (i + 1) * span_load_input2_size, span_load_input2_size,
GDRAM2NRAM);
__memcpy_async(ping_points_z + ((i + 1) % 2) * ping_pong_gap,
points_z_start + (i + 1) * span_load_input3_size, span_load_input3_size,
GDRAM2NRAM);
computeStoreRoipointPool3d<T>(
boxes3d, &cnt, ping_points_x + (i % 2) * ping_pong_gap,
ping_points_y + (i % 2) * ping_pong_gap, ping_points_z + (i % 2) * ping_pong_gap,
point_features_start + i * span_load_input4_size, auxiliary_a, auxiliary_b, auxiliary_c,
auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
__asm__ volatile("sync;");
}
if (rem > 0) {
if (sizeof(T) == sizeof(float)) {
__bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
} else {
__bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
}
__memcpy_async(ping_points_x + (repeat % 2) * ping_pong_gap,
points_x_start + repeat * span_load_input1_size, rem * sizeof(T),
GDRAM2NRAM);
__memcpy_async(ping_points_y + (repeat % 2) * ping_pong_gap,
points_y_start + repeat * span_load_input2_size, rem * sizeof(T),
GDRAM2NRAM);
__memcpy_async(ping_points_z + (repeat % 2) * ping_pong_gap,
points_z_start + repeat * span_load_input3_size, rem * sizeof(T),
GDRAM2NRAM);
}
if (repeat > 0 && rem > 0) {
computeStoreRoipointPool3d<T>(
boxes3d, &cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
} else if (repeat > 0 && rem == 0) {
computeStoreLastBlockRoipointPool3d<T>(
boxes3d, &cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, span_num_deal, pooled_features_start,
pooled_empty_flag_start);
}
if (rem > 0) {
__asm__ volatile("sync;");
computeStoreLastBlockRoipointPool3d<T>(
boxes3d, &cnt, ping_points_x + (repeat % 2) * ping_pong_gap,
ping_points_y + (repeat % 2) * ping_pong_gap,
ping_points_z + (repeat % 2) * ping_pong_gap,
point_features_start + repeat * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, align_rem, span_num_deal, pooled_features_start,
pooled_empty_flag_start);
}
}
}
}
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<float>(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram);
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<half>(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram);
void KernelRoiPointPool3dLargeBoxesNumForward(cnrtDim3_t k_dim,
cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const void *points_xyz,
const void *boxes3d,
const void *point_features,
void *pooled_features,
int *pooled_empty_flag) {
switch (d_type) {
default: { break; }
case CNRT_FLOAT32: {
MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<float><<<k_dim, k_type, queue>>>(
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
(char *)points_xyz, (char *)point_features, (char *)boxes3d,
(char *)pooled_features, (char *)pooled_empty_flag);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<half><<<k_dim, k_type, queue>>>(
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
(char *)points_xyz, (char *)point_features, (char *)boxes3d,
(char *)pooled_features, (char *)pooled_empty_flag);
}; break;
}
}
mmcv/ops/csrc/common/mlu/roipoint_pool3d_mlu_kernel.mlu
deleted
100644 → 0
View file @
099ee24d
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* OR IMPLIED, INCLUDING BUvoid NOKType LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENvoid SHALL THE AUTHORS OR COPYRIGHKType HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORvoid OR OTHERWISE, ARISING FROM, OUKType OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
/**************************************************************************************
*
* NRAM partition:
* | boxes3d | cnt |
* | boxes_num * 7 * sizeof(T) | boxes_num * sizeof(int) |
*
* | ping points | pong points | aux_a ~ aux_f |
* | 3 * deal_num * sizeof(T) | 3 * deal_num * sizeof(T) | 6 * deal_num * sizeof(T) |
*
***************************************************************************************/
#define TWELVE_SPLIT 12
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void checkPointsInBox3d(const T *boxes3d,
const size_t deal_num,
T *x,
T *y,
T *z,
T *auxiliary_a,
T *auxiliary_b,
T *auxiliary_c,
T *auxiliary_d,
T *auxiliary_e,
T *auxiliary_f,
T *pts_assign) {
// param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate
T cx = boxes3d[0];
T cy = boxes3d[1];
T cz = boxes3d[2];
T dx = boxes3d[3];
T dy = boxes3d[4];
T dz = boxes3d[5];
T rz = boxes3d[6];
// shift to the center since cz in box3d is the bottom center
cz += 0.5 * dz;
T cosa = (T)std::cos(-rz);
T sina = (T)std::sin(-rz);
// x - cx
__bang_sub_scalar((T *)auxiliary_a, (T *)x, (T)cx, deal_num);
// y - cy
__bang_sub_scalar((T *)auxiliary_b, (T *)y, (T)cy, deal_num);
// z - cz
__bang_sub_scalar((T *)auxiliary_c, (T *)z, (T)cz, deal_num);
// |z - cz|
__bang_active_abs((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
// |z - cz| > dz / 2.0
#if __BANG_ARCH__ >= 322
__bang_gt_scalar((T *)auxiliary_c, (T *)auxiliary_c, (T)(0.5 * dz), deal_num);
#else
__bang_write_value((T *)auxiliary_d, deal_num, (T)(0.5 * dz));
__bang_lt((T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_c, deal_num);
#endif
// !(|z - cz| > dz / 2.0)
__bang_not((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
// (x - cx) * cos(-rz)
__bang_mul_scalar((T *)auxiliary_d, (T *)auxiliary_a, (T)cosa, deal_num);
// (y - cy) * sin(-rz)
__bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_b, (T)sina, deal_num);
// local_x = (x - cx) * cos(-rz) + (y - cy) * -sin(-rz)
__bang_sub((T *)auxiliary_d, (T *)auxiliary_d, (T *)auxiliary_e, deal_num);
// |local_x|
__bang_active_abs((T *)auxiliary_d, (T *)auxiliary_d, deal_num);
// |local_x| < dx / 2.0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar(auxiliary_d, auxiliary_d, (T)(0.5 * dx), deal_num);
#else
__bang_write_value((T *)auxiliary_e, deal_num, (T)(0.5 * dx));
__bang_gt((T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_d, deal_num);
#endif
// (x - cx) * sin(-rz)
__bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_a, (T)sina, deal_num);
// (y - cy) * cos(-rz)
__bang_mul_scalar((T *)auxiliary_f, (T *)auxiliary_b, (T)cosa, deal_num);
// local_y = (x - cx) * sin(-rz) + (y - cy) * cos(-rz)
__bang_add((T *)auxiliary_e, (T *)auxiliary_e, (T *)auxiliary_f, deal_num);
// |local_y|
__bang_active_abs((T *)auxiliary_e, (T *)auxiliary_e, deal_num);
// |local_y| < dy / 2.0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar(auxiliary_e, auxiliary_e, (T)(0.5 * dy), deal_num);
#else
__bang_write_value((T *)auxiliary_f, deal_num, (T)(0.5 * dy));
__bang_gt((T *)auxiliary_e, (T *)auxiliary_f, (T *)auxiliary_e, deal_num);
#endif
// pts_assign = |x - cx| < dx / 2.0 && |y - cy| < dy / 2.0 && |z - cz| <= dz / 2.0
__bang_mul((T *)pts_assign, (T *)auxiliary_c, (T *)auxiliary_d, deal_num);
__bang_mul((T *)pts_assign, (T *)pts_assign, (T *)auxiliary_e, deal_num);
}
template <typename T>
__mlu_func__ void computeStoreRoipointPool3d(char *boxes3d,
int *cnt,
char *points_x,
char *points_y,
char *points_z,
const char *point_features,
char *auxiliary_a,
char *auxiliary_b,
char *auxiliary_c,
char *auxiliary_d,
char *auxiliary_e,
char *auxiliary_f,
const int box_idx,
const int pts_num,
const int feature_in_len,
const int sampled_pts_num,
const size_t span_num_deal,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
char *pts_assign = auxiliary_a;
if (cnt[box_idx] >= sampled_pts_num) {
return;
}
checkPointsInBox3d((T *)(boxes3d + box_idx * 7 * sizeof(T)), span_num_deal, (T *)points_x,
(T *)points_y, (T *)points_z, (T *)auxiliary_a, (T *)auxiliary_b,
(T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_f,
(T *)pts_assign);
// __bang_select returns selected elements vector and the number of selected elements
__bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
uint32_t select_num = *((uint32_t *)auxiliary_b);
if (select_num == 0) {
return;
}
int sampled_pts_num_rem = sampled_pts_num - cnt[box_idx];
int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
// copy x to pooled_features_gdram
// The result of __bang_select is composed of three parts:
// The first 4-byte is the number of selected element, whose data type is unsigned int.
// The next 124-byte is zero. The rest bytes are the selected elements.
int select_num_size = 128;
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T),
(T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
1 * sizeof(T),
(T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
2 * sizeof(T),
(T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy features to pooled_features_gdram
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
(3 + c_idx) * sizeof(T),
auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
}
cnt[box_idx] += select_num;
}
template <typename T>
__mlu_func__ void computeStoreLastBlockRoipointPool3d(char *boxes3d,
int *cnt,
char *points_x,
char *points_y,
char *points_z,
const char *point_features,
char *auxiliary_a,
char *auxiliary_b,
char *auxiliary_c,
char *auxiliary_d,
char *auxiliary_e,
char *auxiliary_f,
const int box_idx,
const int pts_num,
const int feature_in_len,
const int sampled_pts_num,
const size_t span_num_deal,
const size_t auxiliary_num_deal,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
char *pts_assign = auxiliary_a;
if (cnt[box_idx] >= sampled_pts_num) {
// pooled_empty_flag_gdram set 0
*((int *)auxiliary_a) = 0;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
return;
}
checkPointsInBox3d((T *)(boxes3d + box_idx * 7 * sizeof(T)), span_num_deal, (T *)points_x,
(T *)points_y, (T *)points_z, (T *)auxiliary_a, (T *)auxiliary_b,
(T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_f,
(T *)pts_assign);
// __bang_select returns selected elements vector and the number of selected elements
__bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
uint32_t select_num = *((uint32_t *)auxiliary_b);
if (cnt[box_idx] + select_num == 0) {
// pooled_empty_flag_gdram set 1
*((int *)auxiliary_a) = 1;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
// pooled_features_gdram set 0
int repeat = (sampled_pts_num * (3 + feature_in_len)) / (auxiliary_num_deal * 6);
int rem = (sampled_pts_num * (3 + feature_in_len)) % (auxiliary_num_deal * 6);
// use auxiliary_a to auxiliary_f
__bang_write_zero((T *)auxiliary_a, PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE));
if (repeat > 0) {
__memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
auxiliary_a, auxiliary_num_deal * 6 * sizeof(T), NRAM2GDRAM,
auxiliary_num_deal * 6 * sizeof(T), 0, repeat - 1);
}
if (rem > 0) {
__memcpy(pooled_features_gdram +
box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T) +
repeat * auxiliary_num_deal * 6 * sizeof(T),
auxiliary_a, rem * sizeof(T), NRAM2GDRAM);
}
return;
}
if (select_num > 0) {
int sampled_pts_num_rem = sampled_pts_num - cnt[box_idx];
int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
// copy x to pooled_features_gdram
// The result of __bang_select is composed of three parts:
// The first 4-byte is the number of selected element, whose data type is unsigned int.
// The next 124-byte is zero. The rest bytes are the selected elements.
int select_num_size = 128;
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T),
(T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
1 * sizeof(T),
(T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
2 * sizeof(T),
(T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
// copy features to pooled_features_gdram
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
(3 + c_idx) * sizeof(T),
auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
segnum);
}
}
// pooled_empty_flag_gdram set 0
*((int *)auxiliary_a) = 0;
__memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
cnt[box_idx] += select_num;
if (cnt[box_idx] < sampled_pts_num) {
// duplicate same points for sampling
int repeat = sampled_pts_num / cnt[box_idx] - 1;
int rem = sampled_pts_num % cnt[box_idx];
if (repeat > 0) {
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T),
pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
cnt[box_idx] * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM,
cnt[box_idx] * (3 + feature_in_len) * sizeof(T), 0, repeat - 1);
}
if (rem > 0) {
__memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + (repeat + 1) * cnt[box_idx]) *
(3 + feature_in_len) * sizeof(T),
pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
rem * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM);
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiPointPool3dForward(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram) {
if (coreId == 0x80) {
return;
}
size_t boxes_per_core = (batch_size * boxes_num) / taskDim;
size_t boxes_rem = (batch_size * boxes_num) % taskDim;
// calc batch_start, batch_end, first_batch_box_start, last batch_box_end for each core
int32_t batch_start = taskId < (boxes_rem + 1) ?
(taskId * (boxes_per_core + 1)) / boxes_num :
(taskId * boxes_per_core + boxes_rem) / boxes_num;
int32_t batch_end = taskId < boxes_rem ?
((taskId + 1) * (boxes_per_core + 1) - 1) / boxes_num :
((taskId + 1) * boxes_per_core + boxes_rem - 1) / boxes_num;
size_t first_batch_box_start = taskId < (boxes_rem + 1) ?
(taskId * (boxes_per_core + 1)) - batch_start * boxes_num :
taskId * boxes_per_core + boxes_rem - batch_start * boxes_num;
size_t last_batch_box_end = taskId < boxes_rem ?
(taskId + 1) * (boxes_per_core + 1) - batch_end * boxes_num :
((taskId + 1) * boxes_per_core + boxes_rem) - batch_end * boxes_num;
// points_xyz : [3, B, N]
const char *points_x_gdram = points_xyz_gdram;
const char *points_y_gdram = points_xyz_gdram + (1 * batch_size * pts_num) * sizeof(T);
const char *points_z_gdram = points_xyz_gdram + (2 * batch_size * pts_num) * sizeof(T);
size_t boxes3d_size = PAD_UP(boxes_num * 7, NFU_ALIGN_SIZE) * sizeof(T);
size_t cnt_size = PAD_UP(boxes_num, NFU_ALIGN_SIZE) * sizeof(int);
size_t span_num_deal = PAD_DOWN(
(MAX_NRAM_SIZE - boxes3d_size - cnt_size) / TWELVE_SPLIT / sizeof(T), NFU_ALIGN_SIZE);
size_t align_num = NFU_ALIGN_SIZE;
int32_t repeat = pts_num / span_num_deal;
size_t rem = pts_num % span_num_deal;
size_t align_rem = CEIL_ALIGN(rem, align_num);
char *boxes3d = nram_buffer;
char *cnt = nram_buffer + boxes3d_size;
char *ping_points_x = cnt + cnt_size;
char *ping_points_y = ping_points_x + span_num_deal * sizeof(T);
char *ping_points_z = ping_points_y + span_num_deal * sizeof(T);
size_t ping_pong_gap = 3 * span_num_deal * sizeof(T);
char *auxiliary_a = ping_points_x + 2 * ping_pong_gap;
char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T);
char *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T);
char *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T);
char *auxiliary_e = auxiliary_d + span_num_deal * sizeof(T);
char *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T);
size_t span_load_input1_size = span_num_deal * sizeof(T);
size_t span_load_input2_size = span_num_deal * sizeof(T);
size_t span_load_input3_size = span_num_deal * sizeof(T);
size_t span_load_input4_size = span_num_deal * sizeof(T);
for (int bs_idx = batch_start; bs_idx <= batch_end; bs_idx++) {
__memcpy_async(boxes3d, boxes3d_gdram + bs_idx * boxes_num * 7 * sizeof(T),
boxes_num * 7 * sizeof(T), GDRAM2NRAM);
__bang_write_zero((int *)cnt, PAD_UP(boxes_num, NFU_ALIGN_SIZE));
const char *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T);
const char *points_y_start = points_y_gdram + bs_idx * pts_num * sizeof(T);
const char *points_z_start = points_z_gdram + bs_idx * pts_num * sizeof(T);
const char *point_features_start =
point_features_gdram + bs_idx * feature_in_len * pts_num * sizeof(T);
char *pooled_features_start =
pooled_features_gdram +
(bs_idx * boxes_num * sampled_pts_num * (3 + feature_in_len)) * sizeof(T);
char *pooled_empty_flag_start = pooled_empty_flag_gdram + bs_idx * boxes_num * sizeof(int);
size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0;
size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num;
if (repeat > 0) {
__memcpy_async(ping_points_x, points_x_start, span_load_input1_size, GDRAM2NRAM);
__memcpy_async(ping_points_y, points_y_start, span_load_input2_size, GDRAM2NRAM);
__memcpy_async(ping_points_z, points_z_start, span_load_input3_size, GDRAM2NRAM);
__asm__ volatile("sync;");
}
for (int i = 0; i < repeat - 1; i++) {
__memcpy_async(ping_points_x + ((i + 1) % 2) * ping_pong_gap,
points_x_start + (i + 1) * span_load_input1_size, span_load_input1_size,
GDRAM2NRAM);
__memcpy_async(ping_points_y + ((i + 1) % 2) * ping_pong_gap,
points_y_start + (i + 1) * span_load_input2_size, span_load_input2_size,
GDRAM2NRAM);
__memcpy_async(ping_points_z + ((i + 1) % 2) * ping_pong_gap,
points_z_start + (i + 1) * span_load_input3_size, span_load_input3_size,
GDRAM2NRAM);
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
computeStoreRoipointPool3d<T>(
boxes3d, (int *)cnt, ping_points_x + (i % 2) * ping_pong_gap,
ping_points_y + (i % 2) * ping_pong_gap, ping_points_z + (i % 2) * ping_pong_gap,
point_features_start + i * span_load_input4_size, auxiliary_a, auxiliary_b, auxiliary_c,
auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
}
__asm__ volatile("sync;");
}
if (rem > 0) {
if (sizeof(T) == sizeof(float)) {
__bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
} else {
__bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
__bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap +
PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
NFU_ALIGN_SIZE, (T)NAN);
}
__memcpy_async(ping_points_x + (repeat % 2) * ping_pong_gap,
points_x_start + repeat * span_load_input1_size, rem * sizeof(T), GDRAM2NRAM);
__memcpy_async(ping_points_y + (repeat % 2) * ping_pong_gap,
points_y_start + repeat * span_load_input2_size, rem * sizeof(T), GDRAM2NRAM);
__memcpy_async(ping_points_z + (repeat % 2) * ping_pong_gap,
points_z_start + repeat * span_load_input3_size, rem * sizeof(T), GDRAM2NRAM);
}
if (repeat > 0 && rem > 0) {
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
computeStoreRoipointPool3d<T>(
boxes3d, (int *)cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
}
} else if (repeat > 0 && rem == 0) {
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
computeStoreLastBlockRoipointPool3d<T>(
boxes3d, (int *)cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, span_num_deal, span_num_deal, pooled_features_start,
pooled_empty_flag_start);
}
}
if (rem > 0) {
__asm__ volatile("sync;");
for (int box_idx = box_start; box_idx < box_end; box_idx++) {
computeStoreLastBlockRoipointPool3d<T>(
boxes3d, (int *)cnt, ping_points_x + (repeat % 2) * ping_pong_gap,
ping_points_y + (repeat % 2) * ping_pong_gap,
ping_points_z + (repeat % 2) * ping_pong_gap,
point_features_start + repeat * span_load_input4_size, auxiliary_a, auxiliary_b,
auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
sampled_pts_num, align_rem, span_num_deal, pooled_features_start,
pooled_empty_flag_start);
}
}
}
}
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dForward<float>(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram);
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dForward<half>(
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const char *points_xyz_gdram,
const char *point_features_gdram,
const char *boxes3d_gdram,
char *pooled_features_gdram,
char *pooled_empty_flag_gdram);
void KernelRoiPointPool3dForward(cnrtDim3_t k_dim,
cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const int batch_size,
const int pts_num,
const int boxes_num,
const int feature_in_len,
const int sampled_pts_num,
const void *points_xyz,
const void *boxes3d,
const void *point_features,
void *pooled_features,
int *pooled_empty_flag) {
switch (d_type) {
default: { break; }
case CNRT_FLOAT32: {
MLUUnion1KernelRoiPointPool3dForward<float><<<k_dim, k_type, queue>>>(
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
(char *)points_xyz, (char *)point_features, (char *)boxes3d,
(char *)pooled_features, (char *)pooled_empty_flag);
}; break;
case CNRT_FLOAT16: {
MLUUnion1KernelRoiPointPool3dForward<half><<<k_dim, k_type, queue>>>(
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
(char *)points_xyz, (char *)point_features, (char *)boxes3d,
(char *)pooled_features, (char *)pooled_empty_flag);
}; break;
}
}
mmcv/ops/csrc/pytorch/mlu/roipoint_pool3d_mlu.cpp
View file @
8b8bf5e1
...
@@ -9,32 +9,7 @@
...
@@ -9,32 +9,7 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "mlu_common_helper.h"
#include "pytorch_mlu_helper.hpp"
void
KernelRoiPointPool3dForward
(
cnrtDim3_t
k_dim
,
cnrtFunctionType_t
k_type
,
cnrtQueue_t
queue
,
const
cnrtDataType_t
d_type
,
const
int
batch_size
,
const
int
pts_num
,
const
int
boxes_num
,
const
int
feature_in_len
,
const
int
sampled_pts_num
,
const
void
*
xyz
,
const
void
*
boxes3d
,
const
void
*
pts_feature
,
void
*
pooled_features
,
int
*
pooled_empty_flag
);
void
KernelRoiPointPool3dLargeBoxesNumForward
(
cnrtDim3_t
k_dim
,
cnrtFunctionType_t
k_type
,
cnrtQueue_t
queue
,
const
cnrtDataType_t
d_type
,
const
int
batch_size
,
const
int
pts_num
,
const
int
boxes_num
,
const
int
feature_in_len
,
const
int
sampled_pts_num
,
const
void
*
xyz
,
const
void
*
boxes3d
,
const
void
*
pts_feature
,
void
*
pooled_features
,
int
*
pooled_empty_flag
);
// policy function
static
void
policyFuncForward
(
cnrtDim3_t
*
k_dim
,
cnrtFunctionType_t
*
k_type
)
{
// start U1 task, occupy all available clusters
k_dim
->
x
=
torch_mlu
::
getDeviceAttr
(
cnrtAttrMcorePerCluster
);
k_dim
->
y
=
torch_mlu
::
getDeviceAttr
(
cnrtAttrClusterCount
);
k_dim
->
z
=
1
;
*
k_type
=
CNRT_FUNC_TYPE_UNION1
;
}
void
RoIPointPool3dForwardMLUKernelLauncher
(
void
RoIPointPool3dForwardMLUKernelLauncher
(
int
batch_size
,
int
pts_num
,
int
boxes_num
,
int
feature_in_len
,
int
batch_size
,
int
pts_num
,
int
boxes_num
,
int
feature_in_len
,
...
@@ -98,50 +73,55 @@ void RoIPointPool3dForwardMLUKernelLauncher(
...
@@ -98,50 +73,55 @@ void RoIPointPool3dForwardMLUKernelLauncher(
"pts_feature element num should be less than 2^31, got "
,
"pts_feature element num should be less than 2^31, got "
,
pts_feature
.
numel
(),
"."
);
pts_feature
.
numel
(),
"."
);
// calculate task dimension
// set contiguous
cnrtDim3_t
k_dim
;
auto
xyz_contiguous
=
torch_mlu
::
cnnl
::
ops
::
cnnl_contiguous
(
cnrtFunctionType_t
k_type
;
xyz
,
xyz
.
suggest_memory_format
());
policyFuncForward
(
&
k_dim
,
&
k_type
);
auto
pts_feature_contiguous
=
torch_mlu
::
cnnl
::
ops
::
cnnl_contiguous
(
pts_feature
,
pts_feature
.
suggest_memory_format
());
// get compute queue
auto
boxes3d_contiguous
=
torch_mlu
::
cnnl
::
ops
::
cnnl_contiguous
(
auto
queue
=
torch_mlu
::
getCurQueue
();
boxes3d
,
boxes3d
.
suggest_memory_format
());
auto
pooled_features_contiguous
=
torch_mlu
::
cnnl
::
ops
::
cnnl_contiguous
(
pooled_features
,
pooled_features
.
suggest_memory_format
());
auto
pooled_empty_flag_contiguous
=
torch_mlu
::
cnnl
::
ops
::
cnnl_contiguous
(
pooled_empty_flag
,
pooled_empty_flag
.
suggest_memory_format
());
// get ptr of tensors
// get ptr of tensors
// transpose points [B, N ,3] -> [3, B, N]
auto
xyz_impl
=
torch_mlu
::
getMluTensorImpl
(
xyz_contiguous
);
auto
xyz_
=
xyz
.
permute
({
2
,
0
,
1
}).
contiguous
();
auto
xyz_impl
=
torch_mlu
::
getMluTensorImpl
(
xyz_
);
auto
xyz_ptr
=
xyz_impl
->
cnnlMalloc
();
auto
xyz_ptr
=
xyz_impl
->
cnnlMalloc
();
// transpose point_features [B, N, C] -> [B, C, N]
auto
pts_feature_impl
=
torch_mlu
::
getMluTensorImpl
(
pts_feature_contiguous
);
auto
pts_feature_
=
pts_feature
.
permute
({
0
,
2
,
1
}).
contiguous
();
auto
pts_feature_impl
=
torch_mlu
::
getMluTensorImpl
(
pts_feature_
);
auto
pts_feature_ptr
=
pts_feature_impl
->
cnnlMalloc
();
auto
pts_feature_ptr
=
pts_feature_impl
->
cnnlMalloc
();
auto
boxes3d_impl
=
torch_mlu
::
getMluTensorImpl
(
boxes3d
);
auto
boxes3d_impl
=
torch_mlu
::
getMluTensorImpl
(
boxes3d
_contiguous
);
auto
boxes3d_ptr
=
boxes3d_impl
->
cnnlMalloc
();
auto
boxes3d_ptr
=
boxes3d_impl
->
cnnlMalloc
();
auto
pooled_features_impl
=
torch_mlu
::
getMluTensorImpl
(
pooled_features
);
auto
pooled_features_impl
=
torch_mlu
::
getMluTensorImpl
(
pooled_features
_contiguous
);
auto
pooled_features_ptr
=
pooled_features_impl
->
cnnlMalloc
();
auto
pooled_features_ptr
=
pooled_features_impl
->
cnnlMalloc
();
auto
pooled_empty_flag_impl
=
torch_mlu
::
getMluTensorImpl
(
pooled_empty_flag
);
auto
pooled_empty_flag_impl
=
torch_mlu
::
getMluTensorImpl
(
pooled_empty_flag
_contiguous
);
auto
pooled_empty_flag_ptr
=
pooled_empty_flag_impl
->
cnnlMalloc
();
auto
pooled_empty_flag_ptr
=
pooled_empty_flag_impl
->
cnnlMalloc
();
// get compute dtype of input
// create tensor descriptors
cnrtDataType_t
data_type
=
torch_mlu
::
toCnrtDtype
(
xyz_
.
dtype
());
MluOpTensorDescriptor
xyz_desc
,
pts_feature_desc
,
boxes3d_desc
,
pooled_features_desc
,
pooled_empty_flag_desc
;
xyz_desc
.
set
(
xyz_contiguous
);
// launch kernel
pts_feature_desc
.
set
(
pts_feature_contiguous
);
if
(
boxes_num
<=
10240
)
{
boxes3d_desc
.
set
(
boxes3d_contiguous
);
CNLOG
(
INFO
)
<<
"Launch Kernel MLUKernelRoiPointPool3dForward<<<"
<<
k_dim
.
x
pooled_features_desc
.
set
(
pooled_features_contiguous
);
<<
", "
<<
k_dim
.
y
<<
", "
<<
k_dim
.
z
<<
">>>"
;
pooled_empty_flag_desc
.
set
(
pooled_empty_flag_contiguous
);
KernelRoiPointPool3dForward
(
k_dim
,
k_type
,
queue
,
data_type
,
batch_size
,
pts_num
,
boxes_num
,
// get workspace
feature_in_len
,
sampled_pts_num
,
xyz_ptr
,
boxes3d_ptr
,
pts_feature_ptr
,
size_t
workspace_size
=
0
;
pooled_features_ptr
,
(
int
*
)
pooled_empty_flag_ptr
);
auto
handle
=
mluOpGetCurrentHandle
();
}
else
{
TORCH_MLUOP_CHECK
(
mluOpGetRoiPointPool3dWorkspaceSize
(
handle
,
batch_size
,
CNLOG
(
INFO
)
pts_num
,
boxes_num
,
feature_in_len
,
sampled_pts_num
,
xyz_desc
.
desc
(),
<<
"Launch Kernel MLUKernelRoiPointPool3dLargeBoxesNumForward<<<"
pts_feature_desc
.
desc
(),
boxes3d_desc
.
desc
(),
pooled_features_desc
.
desc
(),
<<
k_dim
.
x
<<
", "
<<
k_dim
.
y
<<
", "
<<
k_dim
.
z
<<
">>>"
;
pooled_empty_flag_desc
.
desc
(),
&
workspace_size
));
KernelRoiPointPool3dLargeBoxesNumForward
(
k_dim
,
k_type
,
queue
,
data_type
,
batch_size
,
pts_num
,
boxes_num
,
auto
workspace
=
at
::
empty
(
workspace_size
,
xyz
.
options
().
dtype
(
at
::
kByte
));
feature_in_len
,
sampled_pts_num
,
xyz_ptr
,
boxes3d_ptr
,
pts_feature_ptr
,
auto
workspace_impl
=
torch_mlu
::
getMluTensorImpl
(
workspace
);
pooled_features_ptr
,
(
int
*
)
pooled_empty_flag_ptr
);
auto
workspace_ptr
=
workspace_impl
->
cnnlMalloc
();
}
TORCH_MLUOP_CHECK
(
mluOpRoiPointPool3d
(
handle
,
batch_size
,
pts_num
,
boxes_num
,
feature_in_len
,
sampled_pts_num
,
xyz_desc
.
desc
(),
xyz_ptr
,
pts_feature_desc
.
desc
(),
pts_feature_ptr
,
boxes3d_desc
.
desc
(),
boxes3d_ptr
,
workspace_ptr
,
workspace_size
,
pooled_features_desc
.
desc
(),
pooled_features_ptr
,
pooled_empty_flag_desc
.
desc
(),
(
int
*
)
pooled_empty_flag_ptr
));
}
}
void
roipoint_pool3d_forward_mlu
(
int
batch_size
,
int
pts_num
,
int
boxes_num
,
void
roipoint_pool3d_forward_mlu
(
int
batch_size
,
int
pts_num
,
int
boxes_num
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment