Unverified Commit b586cc2f authored by Jiazhen Wang's avatar Jiazhen Wang Committed by GitHub
Browse files

[Refactor] Replace DIVUP with GET_BLOCKS (#1586)

* [Improve] migrating DIVUP to GET_BLOCKS

* [Fix] use GET_BLOCKS only for block alloc and del useless statements

* [Fix] add kernel loop for nms and del useless statements
parent cf754db9
...@@ -22,34 +22,34 @@ __global__ void assign_score_withk_forward_cuda_kernel( ...@@ -22,34 +22,34 @@ __global__ void assign_score_withk_forward_cuda_kernel(
const int O, const int aggregate, const T* points, const T* centers, const int O, const int aggregate, const T* points, const T* centers,
const T* scores, const int64_t* knn_idx, T* output) { const T* scores, const int64_t* knn_idx, T* output) {
// ----- parallel loop for B, N1, K and O --------- // ----- parallel loop for B, N1, K and O ---------
long i = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) {
if (i >= B * N1 * K * O) return; // ------- loop for M ----------
// ------- loop for M ---------- const int b = (int)(i / (O * N1 * K));
const int b = (int)(i / (O * N1 * K)); const int o = (int)(i % (O * N1 * K) / (N1 * K));
const int o = (int)(i % (O * N1 * K) / (N1 * K)); const int n = (int)(i % (N1 * K) / K);
const int n = (int)(i % (N1 * K) / K); const int k = (int)(i % K);
const int k = (int)(i % K); const int cn = (int)knn_idx[b * K * N1 + n * K +
const int cn = (int)knn_idx[b * K * N1 + n * K + 0]; // The first neighbor is the center point
0]; // The first neighbor is the center point const int kn = (int)knn_idx[b * K * N1 + n * K + k];
const int kn = (int)knn_idx[b * K * N1 + n * K + k]; if (kn >= N0 ||
if (kn >= N0 || kn < 0) { // if index overflows, it is out of the neighborhood range
kn < 0) { // if index overflows, it is out of the neighborhood range return;
return; }
} assert(b < B);
assert(b < B); assert(kn < N0);
assert(kn < N0); assert(cn < N0);
assert(cn < N0); assert(o < O);
assert(o < O); assert(n < N1);
assert(n < N1); const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; T val = output[out_idx];
T val = output[out_idx]; for (int m = 0; m < M; m++) {
for (int m = 0; m < M; m++) { val += points[b * N0 * M * O + kn * M * O + m * O + o] *
val += points[b * N0 * M * O + kn * M * O + m * O + o] * scores[b * N1 * K * M + n * K * M + k * M + m] -
scores[b * N1 * K * M + n * K * M + k * M + m] - centers[b * N0 * M * O + cn * M * O + m * O + o] *
centers[b * N0 * M * O + cn * M * O + m * O + o] * scores[b * N1 * K * M + n * K * M + k * M + m];
scores[b * N1 * K * M + n * K * M + k * M + m]; }
output[out_idx] = val;
} }
output[out_idx] = val;
} }
template <typename T> template <typename T>
...@@ -58,27 +58,27 @@ __global__ void assign_score_withk_points_backward_cuda_kernel( ...@@ -58,27 +58,27 @@ __global__ void assign_score_withk_points_backward_cuda_kernel(
const int O, const int aggregate, const T* grad_out, const T* scores, const int O, const int aggregate, const T* grad_out, const T* scores,
const int64_t* knn_idx, T* grad_points, T* grad_centers) { const int64_t* knn_idx, T* grad_points, T* grad_centers) {
// ----- parallel loop for B, M, O --------- // ----- parallel loop for B, M, O ---------
long i = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(i, B * M * O) {
if (i >= B * M * O) return; int b = (int)(i / (M * O));
int b = (int)(i / (M * O)); int m = (int)(i % (M * O) / O);
int m = (int)(i % (M * O) / O); int o = (int)(i % O);
int o = (int)(i % O);
// ----- loop for N,K --------- // ----- loop for N,K ---------
for (int n = 0; n < N; n++) { for (int n = 0; n < N; n++) {
for (int k = 0; k < K; k++) { for (int k = 0; k < K; k++) {
int kn = knn_idx[b * N * K + n * K + k]; int kn = knn_idx[b * N * K + n * K + k];
int cn = knn_idx[b * N * K + n * K + 0]; int cn = knn_idx[b * N * K + n * K + 0];
if (kn >= N0 || if (kn >= N0 || kn < 0) { // if index overflows, it is out of the
kn < 0) { // if index overflows, it is out of the neighborhood range // neighborhood range
continue; continue;
}
atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
-scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
} }
atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
-scores[b * N * K * M + n * K * M + k * M + m] *
grad_out[b * O * N * K + o * N * K + n * K + k]);
} }
} }
} }
...@@ -89,28 +89,28 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel( ...@@ -89,28 +89,28 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel(
const int O, const int aggregate, const T* grad_out, const T* points, const int O, const int aggregate, const T* grad_out, const T* points,
const T* centers, const int64_t* knn_idx, T* grad_scores) { const T* centers, const int64_t* knn_idx, T* grad_scores) {
// ----- parallel loop for B, N, K, M --------- // ----- parallel loop for B, N, K, M ---------
long i = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(i, B * N * K * M) {
if (i >= B * N * K * M) return; const int b = (int)(i / (N * M * K));
const int b = (int)(i / (N * M * K)); const int n = (int)(i % (N * M * K) / M / K);
const int n = (int)(i % (N * M * K) / M / K); const int k = (int)(i % (M * K) / M);
const int k = (int)(i % (M * K) / M); const int m = (int)(i % M);
const int m = (int)(i % M); const int cn = knn_idx[b * N * K + n * K + 0];
const int cn = knn_idx[b * N * K + n * K + 0]; const int kn = knn_idx[b * N * K + n * K + k];
const int kn = knn_idx[b * N * K + n * K + k]; if (kn >= N0 ||
if (kn >= N0 || kn < 0) { // if index overflows, it is out of the neighborhood range
kn < 0) { // if index overflows, it is out of the neighborhood range return;
return; }
}
// -------------- loop for O ------------------------ // -------------- loop for O ------------------------
const int out_idx = b * N * K * M + n * K * M + k * M + m; const int out_idx = b * N * K * M + n * K * M + k * M + m;
T val = grad_scores[out_idx]; T val = grad_scores[out_idx];
for (int o = 0; o < O; o++) { for (int o = 0; o < O; o++) {
val += (points[b * N0 * M * O + kn * M * O + m * O + o] - val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
centers[b * N0 * M * O + cn * M * O + m * O + o]) * centers[b * N0 * M * O + cn * M * O + m * O + o]) *
grad_out[b * O * N * K + o * N * K + n * K + k]; grad_out[b * O * N * K + o * N * K + n * K + k];
}
grad_scores[out_idx] = val;
} }
grad_scores[out_idx] = val;
} }
#endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH #endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
...@@ -21,35 +21,36 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m, ...@@ -21,35 +21,36 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m,
// output: // output:
// idx: (B, M, nsample) // idx: (B, M, nsample)
int bs_idx = blockIdx.y; int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(pt_idx, m) {
if (bs_idx >= b || pt_idx >= m) return; if (bs_idx >= b) return;
new_xyz += bs_idx * m * 3 + pt_idx * 3; new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3; xyz += bs_idx * n * 3;
idx += bs_idx * m * nsample + pt_idx * nsample; idx += bs_idx * m * nsample + pt_idx * nsample;
float max_radius2 = max_radius * max_radius; float max_radius2 = max_radius * max_radius;
float min_radius2 = min_radius * min_radius; float min_radius2 = min_radius * min_radius;
T new_x = new_xyz[0]; T new_x = new_xyz[0];
T new_y = new_xyz[1]; T new_y = new_xyz[1];
T new_z = new_xyz[2]; T new_z = new_xyz[2];
int cnt = 0; int cnt = 0;
for (int k = 0; k < n; ++k) { for (int k = 0; k < n; ++k) {
T x = xyz[k * 3 + 0]; T x = xyz[k * 3 + 0];
T y = xyz[k * 3 + 1]; T y = xyz[k * 3 + 1];
T z = xyz[k * 3 + 2]; T z = xyz[k * 3 + 2];
T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
(new_z - z) * (new_z - z); (new_z - z) * (new_z - z);
if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
if (cnt == 0) { if (cnt == 0) {
for (int l = 0; l < nsample; ++l) { for (int l = 0; l < nsample; ++l) {
idx[l] = k; idx[l] = k;
}
} }
idx[cnt] = k;
++cnt;
if (cnt >= nsample) break;
} }
idx[cnt] = k;
++cnt;
if (cnt >= nsample) break;
} }
} }
} }
......
...@@ -7,12 +7,20 @@ ...@@ -7,12 +7,20 @@
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x) i += blockDim.x * gridDim.x)
#define THREADS_PER_BLOCK 512 #define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x) \
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
j += blockDim.y * gridDim.y)
#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
for (size_t j = blockIdx.y; j < (m); j += gridDim.y)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) #define THREADS_PER_BLOCK 512
inline int GET_BLOCKS(const int N) { inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; int optimal_block_num = (N + num_threads - 1) / num_threads;
int max_block_num = 4096; int max_block_num = 4096;
return min(optimal_block_num, max_block_num); return min(optimal_block_num, max_block_num);
} }
......
...@@ -22,13 +22,14 @@ __global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m, ...@@ -22,13 +22,14 @@ __global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m,
int bs_idx = blockIdx.z; int bs_idx = blockIdx.z;
int c_idx = blockIdx.y; int c_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(pt_idx, m) {
if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; if (bs_idx >= b || c_idx >= c) return;
out += bs_idx * c * m + c_idx * m + pt_idx; out += bs_idx * c * m + c_idx * m + pt_idx;
idx += bs_idx * m + pt_idx; idx += bs_idx * m + pt_idx;
points += bs_idx * c * n + c_idx * n; points += bs_idx * c * n + c_idx * n;
out[0] = points[idx[0]]; out[0] = points[idx[0]];
}
} }
template <typename T> template <typename T>
...@@ -43,14 +44,15 @@ __global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m, ...@@ -43,14 +44,15 @@ __global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m,
int bs_idx = blockIdx.z; int bs_idx = blockIdx.z;
int c_idx = blockIdx.y; int c_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(pt_idx, m) {
if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; if (bs_idx >= b || c_idx >= c) return;
grad_out += bs_idx * c * m + c_idx * m + pt_idx; grad_out += bs_idx * c * m + c_idx * m + pt_idx;
idx += bs_idx * m + pt_idx; idx += bs_idx * m + pt_idx;
grad_points += bs_idx * c * n + c_idx * n; grad_points += bs_idx * c * n + c_idx * n;
atomicAdd(grad_points + idx[0], grad_out[0]); atomicAdd(grad_points + idx[0], grad_out[0]);
}
} }
#endif // GATHER_POINTS_CUDA_KERNEL_CUH #endif // GATHER_POINTS_CUDA_KERNEL_CUH
...@@ -22,18 +22,19 @@ __global__ void group_points_forward_cuda_kernel(int b, int c, int n, ...@@ -22,18 +22,19 @@ __global__ void group_points_forward_cuda_kernel(int b, int c, int n,
// out: (B, C, npoints, nsample) // out: (B, C, npoints, nsample)
int bs_idx = blockIdx.z; int bs_idx = blockIdx.z;
int c_idx = blockIdx.y; int c_idx = blockIdx.y;
int index = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
int pt_idx = index / nsample; if (bs_idx >= b || c_idx >= c) return;
if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
int sample_idx = index % nsample; int pt_idx = index / nsample;
int sample_idx = index % nsample;
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
int in_idx = bs_idx * c * n + c_idx * n + idx[0]; int in_idx = bs_idx * c * n + c_idx * n + idx[0];
int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
pt_idx * nsample + sample_idx; pt_idx * nsample + sample_idx;
out[out_idx] = points[in_idx]; out[out_idx] = points[in_idx];
}
} }
template <typename T> template <typename T>
...@@ -48,16 +49,17 @@ __global__ void group_points_backward_cuda_kernel(int b, int c, int n, ...@@ -48,16 +49,17 @@ __global__ void group_points_backward_cuda_kernel(int b, int c, int n,
// grad_points: (B, C, N) // grad_points: (B, C, N)
int bs_idx = blockIdx.z; int bs_idx = blockIdx.z;
int c_idx = blockIdx.y; int c_idx = blockIdx.y;
int index = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
int pt_idx = index / nsample; int pt_idx = index / nsample;
if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; if (bs_idx >= b || c_idx >= c) return;
int sample_idx = index % nsample; int sample_idx = index % nsample;
grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
pt_idx * nsample + sample_idx; pt_idx * nsample + sample_idx;
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]); atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
}
} }
#endif // GROUP_POINTS_CUDA_KERNEL_CUH #endif // GROUP_POINTS_CUDA_KERNEL_CUH
...@@ -220,16 +220,15 @@ __device__ inline float iou_bev(const float *box_a, const float *box_b) { ...@@ -220,16 +220,15 @@ __device__ inline float iou_bev(const float *box_a, const float *box_b) {
__global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel( __global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel(
const int num_a, const float *boxes_a, const int num_b, const int num_a, const float *boxes_a, const int num_b,
const float *boxes_b, float *ans_overlap) { const float *boxes_b, float *ans_overlap) {
const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) {
const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; if (a_idx >= num_a || b_idx >= num_b) {
return;
if (a_idx >= num_a || b_idx >= num_b) { }
return; const float *cur_box_a = boxes_a + a_idx * 5;
const float *cur_box_b = boxes_b + b_idx * 5;
float s_overlap = box_overlap(cur_box_a, cur_box_b);
ans_overlap[a_idx * num_b + b_idx] = s_overlap;
} }
const float *cur_box_a = boxes_a + a_idx * 5;
const float *cur_box_b = boxes_b + b_idx * 5;
float s_overlap = box_overlap(cur_box_a, cur_box_b);
ans_overlap[a_idx * num_b + b_idx] = s_overlap;
} }
__global__ void iou3d_boxes_iou_bev_forward_cuda_kernel(const int num_a, __global__ void iou3d_boxes_iou_bev_forward_cuda_kernel(const int num_a,
...@@ -237,17 +236,16 @@ __global__ void iou3d_boxes_iou_bev_forward_cuda_kernel(const int num_a, ...@@ -237,17 +236,16 @@ __global__ void iou3d_boxes_iou_bev_forward_cuda_kernel(const int num_a,
const int num_b, const int num_b,
const float *boxes_b, const float *boxes_b,
float *ans_iou) { float *ans_iou) {
const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) {
const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; if (a_idx >= num_a || b_idx >= num_b) {
return;
}
if (a_idx >= num_a || b_idx >= num_b) { const float *cur_box_a = boxes_a + a_idx * 5;
return; const float *cur_box_b = boxes_b + b_idx * 5;
float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
} }
const float *cur_box_a = boxes_a + a_idx * 5;
const float *cur_box_b = boxes_b + b_idx * 5;
float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
} }
__global__ void nms_forward_cuda_kernel(const int boxes_num, __global__ void nms_forward_cuda_kernel(const int boxes_num,
...@@ -256,50 +254,51 @@ __global__ void nms_forward_cuda_kernel(const int boxes_num, ...@@ -256,50 +254,51 @@ __global__ void nms_forward_cuda_kernel(const int boxes_num,
unsigned long long *mask) { unsigned long long *mask) {
// params: boxes (N, 5) [x1, y1, x2, y2, ry] // params: boxes (N, 5) [x1, y1, x2, y2, ry]
// params: mask (N, N/THREADS_PER_BLOCK_NMS) // params: mask (N, N/THREADS_PER_BLOCK_NMS)
const int blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
// if (row_start > col_start) return;
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
__shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
const int row_start = blockIdx.y; if (threadIdx.x < row_size) {
const int col_start = blockIdx.x; const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
const float *cur_box = boxes + cur_box_idx * 5;
// if (row_start > col_start) return;
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
__shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
const float *cur_box = boxes + cur_box_idx * 5;
int i = 0; int i = 0;
unsigned long long t = 0; unsigned long long t = 0;
int start = 0; int start = 0;
if (row_start == col_start) { if (row_start == col_start) {
start = threadIdx.x + 1; start = threadIdx.x + 1;
} }
for (i = start; i < col_size; i++) { for (i = start; i < col_size; i++) {
if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i; t |= 1ULL << i;
}
} }
const int col_blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
mask[cur_box_idx * col_blocks + col_start] = t;
} }
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
mask[cur_box_idx * col_blocks + col_start] = t;
} }
} }
...@@ -320,49 +319,51 @@ __global__ void nms_normal_forward_cuda_kernel(const int boxes_num, ...@@ -320,49 +319,51 @@ __global__ void nms_normal_forward_cuda_kernel(const int boxes_num,
// params: boxes (N, 5) [x1, y1, x2, y2, ry] // params: boxes (N, 5) [x1, y1, x2, y2, ry]
// params: mask (N, N/THREADS_PER_BLOCK_NMS) // params: mask (N, N/THREADS_PER_BLOCK_NMS)
const int row_start = blockIdx.y; const int blocks =
const int col_start = blockIdx.x; (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
// if (row_start > col_start) return; // if (row_start > col_start) return;
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS); THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS); THREADS_PER_BLOCK_NMS);
__shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
if (threadIdx.x < col_size) { if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] = block_boxes[threadIdx.x * 5 + 0] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] = block_boxes[threadIdx.x * 5 + 1] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] = block_boxes[threadIdx.x * 5 + 2] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] = block_boxes[threadIdx.x * 5 + 3] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3]; boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] = block_boxes[threadIdx.x * 5 + 4] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
} }
__syncthreads(); __syncthreads();
if (threadIdx.x < row_size) { if (threadIdx.x < row_size) {
const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
const float *cur_box = boxes + cur_box_idx * 5; const float *cur_box = boxes + cur_box_idx * 5;
int i = 0; int i = 0;
unsigned long long t = 0; unsigned long long t = 0;
int start = 0; int start = 0;
if (row_start == col_start) { if (row_start == col_start) {
start = threadIdx.x + 1; start = threadIdx.x + 1;
} }
for (i = start; i < col_size; i++) { for (i = start; i < col_size; i++) {
if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i; t |= 1ULL << i;
}
} }
const int col_blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
mask[cur_box_idx * col_blocks + col_start] = t;
} }
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
mask[cur_box_idx * col_blocks + col_start] = t;
} }
} }
......
...@@ -51,40 +51,41 @@ __global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample, ...@@ -51,40 +51,41 @@ __global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample,
const T *xyz, const T *new_xyz, const T *xyz, const T *new_xyz,
int *__restrict__ idx, T *dist2) { int *__restrict__ idx, T *dist2) {
int bs_idx = blockIdx.y; int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(pt_idx, m) {
if (bs_idx >= b || pt_idx >= m) return; if (bs_idx >= b) return;
new_xyz += bs_idx * m * 3 + pt_idx * 3; new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3; xyz += bs_idx * n * 3;
idx += bs_idx * m * nsample + pt_idx * nsample; idx += bs_idx * m * nsample + pt_idx * nsample;
dist2 += bs_idx * m * nsample + pt_idx * nsample; dist2 += bs_idx * m * nsample + pt_idx * nsample;
T new_x = new_xyz[0]; T new_x = new_xyz[0];
T new_y = new_xyz[1]; T new_y = new_xyz[1];
T new_z = new_xyz[2]; T new_z = new_xyz[2];
float best_dist[100]; float best_dist[100];
int best_idx[100]; int best_idx[100];
for (int i = 0; i < nsample; i++) { for (int i = 0; i < nsample; i++) {
best_dist[i] = 1e10; best_dist[i] = 1e10;
best_idx[i] = 0; best_idx[i] = 0;
} }
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
T x = xyz[i * 3 + 0]; T x = xyz[i * 3 + 0];
T y = xyz[i * 3 + 1]; T y = xyz[i * 3 + 1];
T z = xyz[i * 3 + 2]; T z = xyz[i * 3 + 2];
T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
(new_z - z) * (new_z - z); (new_z - z) * (new_z - z);
if (d2 < best_dist[0]) { if (d2 < best_dist[0]) {
best_dist[0] = d2; best_dist[0] = d2;
best_idx[0] = i; best_idx[0] = i;
reheap(best_dist, best_idx, nsample); reheap(best_dist, best_idx, nsample);
}
}
heap_sort(best_dist, best_idx, nsample);
for (int i = 0; i < nsample; i++) {
idx[i] = best_idx[i];
dist2[i] = best_dist[i];
} }
}
heap_sort(best_dist, best_idx, nsample);
for (int i = 0; i < nsample; i++) {
idx[i] = best_idx[i];
dist2[i] = best_dist[i];
} }
} }
......
...@@ -15,9 +15,6 @@ ...@@ -15,9 +15,6 @@
#include "pytorch_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
const int CUDA_NUM_THREADS = 1024; const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N, const int num_threads) {
return (N + num_threads - 1) / num_threads;
}
template <typename scalar_t> template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear( __device__ scalar_t ms_deform_attn_im2col_bilinear(
......
...@@ -30,45 +30,46 @@ __device__ inline bool devIoU(float const *const a, float const *const b, ...@@ -30,45 +30,46 @@ __device__ inline bool devIoU(float const *const a, float const *const b,
__global__ void nms_cuda(const int n_boxes, const float iou_threshold, __global__ void nms_cuda(const int n_boxes, const float iou_threshold,
const int offset, const float *dev_boxes, const int offset, const float *dev_boxes,
unsigned long long *dev_mask) { unsigned long long *dev_mask) {
const int row_start = blockIdx.y; int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
const int col_start = blockIdx.x; CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
const int tid = threadIdx.x; const int tid = threadIdx.x;
if (row_start > col_start) return; if (row_start > col_start) return;
const int row_size = const int row_size =
fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size = const int col_size =
fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 4]; __shared__ float block_boxes[threadsPerBlock * 4];
if (tid < col_size) { if (tid < col_size) {
block_boxes[tid * 4 + 0] = block_boxes[tid * 4 + 0] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
block_boxes[tid * 4 + 1] = block_boxes[tid * 4 + 1] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
block_boxes[tid * 4 + 2] = block_boxes[tid * 4 + 2] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
block_boxes[tid * 4 + 3] = block_boxes[tid * 4 + 3] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
}
__syncthreads();
if (tid < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + tid;
const float *cur_box = dev_boxes + cur_box_idx * 4;
int i = 0;
unsigned long long int t = 0;
int start = 0;
if (row_start == col_start) {
start = tid + 1;
} }
for (i = start; i < col_size; i++) { __syncthreads();
if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
t |= 1ULL << i; if (tid < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + tid;
const float *cur_box = dev_boxes + cur_box_idx * 4;
int i = 0;
unsigned long long int t = 0;
int start = 0;
if (row_start == col_start) {
start = tid + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
t |= 1ULL << i;
}
} }
dev_mask[cur_box_idx * gridDim.y + col_start] = t;
} }
dev_mask[cur_box_idx * gridDim.y + col_start] = t;
} }
} }
#endif // NMS_CUDA_KERNEL_CUH #endif // NMS_CUDA_KERNEL_CUH
...@@ -45,20 +45,21 @@ __global__ void points_in_boxes_part_forward_cuda_kernel( ...@@ -45,20 +45,21 @@ __global__ void points_in_boxes_part_forward_cuda_kernel(
// (B, npoints), default -1 // (B, npoints), default -1
int bs_idx = blockIdx.y; int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
if (bs_idx >= batch_size || pt_idx >= pts_num) return; if (bs_idx >= batch_size) return;
boxes += bs_idx * boxes_num * 7; boxes += bs_idx * boxes_num * 7;
pts += bs_idx * pts_num * 3 + pt_idx * 3; pts += bs_idx * pts_num * 3 + pt_idx * 3;
box_idx_of_points += bs_idx * pts_num + pt_idx; box_idx_of_points += bs_idx * pts_num + pt_idx;
T local_x = 0, local_y = 0; T local_x = 0, local_y = 0;
int cur_in_flag = 0; int cur_in_flag = 0;
for (int k = 0; k < boxes_num; k++) { for (int k = 0; k < boxes_num; k++) {
cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
if (cur_in_flag) { if (cur_in_flag) {
box_idx_of_points[0] = k; box_idx_of_points[0] = k;
break; break;
}
} }
} }
} }
...@@ -73,19 +74,20 @@ __global__ void points_in_boxes_all_forward_cuda_kernel( ...@@ -73,19 +74,20 @@ __global__ void points_in_boxes_all_forward_cuda_kernel(
// (B, npoints), default -1 // (B, npoints), default -1
int bs_idx = blockIdx.y; int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
if (bs_idx >= batch_size || pt_idx >= pts_num) return; if (bs_idx >= batch_size) return;
boxes += bs_idx * boxes_num * 7; boxes += bs_idx * boxes_num * 7;
pts += bs_idx * pts_num * 3 + pt_idx * 3; pts += bs_idx * pts_num * 3 + pt_idx * 3;
box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num; box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;
T local_x = 0, local_y = 0; T local_x = 0, local_y = 0;
for (int k = 0; k < boxes_num; k++) { for (int k = 0; k < boxes_num; k++) {
const int cur_in_flag = const int cur_in_flag =
check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
if (cur_in_flag) { if (cur_in_flag) {
box_idx_of_points[k] = 1; box_idx_of_points[k] = 1;
}
} }
} }
} }
......
...@@ -44,37 +44,38 @@ __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, ...@@ -44,37 +44,38 @@ __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
// coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N, // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N,
// npoints): -1 means point does not in this box, otherwise: encode (x_idxs, // npoints): -1 means point does not in this box, otherwise: encode (x_idxs,
// y_idxs, z_idxs) by binary bit // y_idxs, z_idxs) by binary bit
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y; int box_idx = blockIdx.y;
if (pt_idx >= pts_num || box_idx >= boxes_num) return; CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
if (box_idx >= boxes_num) return;
pts += pt_idx * 3; pts += pt_idx * 3;
rois += box_idx * 7; rois += box_idx * 7;
pts_mask += box_idx * pts_num + pt_idx; pts_mask += box_idx * pts_num + pt_idx;
T local_x = 0, local_y = 0; T local_x = 0, local_y = 0;
int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y); int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
pts_mask[0] = -1; pts_mask[0] = -1;
if (cur_in_flag > 0) { if (cur_in_flag > 0) {
T local_z = pts[2] - rois[2]; T local_z = pts[2] - rois[2];
T x_size = rois[3], y_size = rois[4], z_size = rois[5]; T x_size = rois[3], y_size = rois[4], z_size = rois[5];
T x_res = x_size / out_x; T x_res = x_size / out_x;
T y_res = y_size / out_y; T y_res = y_size / out_y;
T z_res = z_size / out_z; T z_res = z_size / out_z;
unsigned int x_idx = int((local_x + x_size / 2) / x_res); unsigned int x_idx = int((local_x + x_size / 2) / x_res);
unsigned int y_idx = int((local_y + y_size / 2) / y_res); unsigned int y_idx = int((local_y + y_size / 2) / y_res);
unsigned int z_idx = int(local_z / z_res); unsigned int z_idx = int(local_z / z_res);
x_idx = min(max(x_idx, 0), out_x - 1); x_idx = min(max(x_idx, 0), out_x - 1);
y_idx = min(max(y_idx, 0), out_y - 1); y_idx = min(max(y_idx, 0), out_y - 1);
z_idx = min(max(z_idx, 0), out_z - 1); z_idx = min(max(z_idx, 0), out_z - 1);
unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
pts_mask[0] = idx_encoding; pts_mask[0] = idx_encoding;
}
} }
} }
...@@ -86,26 +87,24 @@ __global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, ...@@ -86,26 +87,24 @@ __global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,
T *pts_idx_of_voxels) { T *pts_idx_of_voxels) {
// params pts_mask: (N, npoints) 0 or 1 // params pts_mask: (N, npoints) 0 or 1
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
CUDA_1D_KERNEL_LOOP(box_idx, boxes_num) {
int box_idx = blockIdx.x * blockDim.x + threadIdx.x; int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter
if (box_idx >= boxes_num) return; pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter for (int k = 0; k < pts_num; k++) {
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel; if (pts_mask[box_idx * pts_num + k] != -1) {
unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
for (int k = 0; k < pts_num; k++) { unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
if (pts_mask[box_idx * pts_num + k] != -1) { unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
unsigned int idx_encoding = pts_mask[box_idx * pts_num + k]; unsigned int z_idx = idx_encoding & 0xFF;
unsigned int x_idx = (idx_encoding >> 16) & 0xFF; unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
unsigned int y_idx = (idx_encoding >> 8) & 0xFF; y_idx * out_z * max_pts_each_voxel +
unsigned int z_idx = idx_encoding & 0xFF; z_idx * max_pts_each_voxel;
unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + unsigned int cnt = pts_idx_of_voxels[base_offset];
y_idx * out_z * max_pts_each_voxel + if (cnt < max_num_pts) {
z_idx * max_pts_each_voxel; pts_idx_of_voxels[base_offset + cnt + 1] = k;
unsigned int cnt = pts_idx_of_voxels[base_offset]; pts_idx_of_voxels[base_offset]++;
if (cnt < max_num_pts) { }
pts_idx_of_voxels[base_offset + cnt + 1] = k;
pts_idx_of_voxels[base_offset]++;
} }
} }
} }
...@@ -124,39 +123,38 @@ __global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, ...@@ -124,39 +123,38 @@ __global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
int box_idx = blockIdx.z; int box_idx = blockIdx.z;
int channel_idx = blockIdx.y; int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
int x_idx = voxel_idx_flat / (out_y * out_z);
int x_idx = voxel_idx_flat / (out_y * out_z); int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; int z_idx = voxel_idx_flat % out_z;
int z_idx = voxel_idx_flat % out_z; if (box_idx >= boxes_num || channel_idx >= channels) return;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z) int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
return; pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; pooled_features += box_idx * out_x * out_y * out_z * channels +
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * channels + channel_idx;
offset_base * max_pts_each_voxel; argmax += box_idx * out_x * out_y * out_z * channels +
pooled_features += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
offset_base * channels + channel_idx;
argmax += box_idx * out_x * out_y * out_z * channels + int argmax_idx = -1;
offset_base * channels + channel_idx; float max_val = -1e50;
int argmax_idx = -1; int total_pts = pts_idx_of_voxels[0];
float max_val = -1e50;
for (int k = 1; k <= total_pts; k++) {
int total_pts = pts_idx_of_voxels[0]; if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] >
max_val) {
for (int k = 1; k <= total_pts; k++) { max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) { argmax_idx = pts_idx_of_voxels[k];
max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; }
argmax_idx = pts_idx_of_voxels[k];
} }
}
if (argmax_idx != -1) { if (argmax_idx != -1) {
pooled_features[0] = max_val; pooled_features[0] = max_val;
}
argmax[0] = argmax_idx;
} }
argmax[0] = argmax_idx;
} }
template <typename T> template <typename T>
...@@ -172,30 +170,28 @@ __global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, ...@@ -172,30 +170,28 @@ __global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,
int box_idx = blockIdx.z; int box_idx = blockIdx.z;
int channel_idx = blockIdx.y; int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
int x_idx = voxel_idx_flat / (out_y * out_z);
int x_idx = voxel_idx_flat / (out_y * out_z); int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; int z_idx = voxel_idx_flat % out_z;
int z_idx = voxel_idx_flat % out_z; if (box_idx >= boxes_num || channel_idx >= channels) return;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z) int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
return; pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; pooled_features += box_idx * out_x * out_y * out_z * channels +
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * channels + channel_idx;
offset_base * max_pts_each_voxel;
pooled_features += box_idx * out_x * out_y * out_z * channels + float sum_val = 0;
offset_base * channels + channel_idx; int total_pts = pts_idx_of_voxels[0];
float sum_val = 0; for (int k = 1; k <= total_pts; k++) {
int total_pts = pts_idx_of_voxels[0]; sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
}
for (int k = 1; k <= total_pts; k++) {
sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
}
if (total_pts > 0) { if (total_pts > 0) {
pooled_features[0] = sum_val / total_pts; pooled_features[0] = sum_val / total_pts;
}
} }
} }
...@@ -210,24 +206,22 @@ __global__ void roiaware_maxpool3d_backward(int boxes_num, int channels, ...@@ -210,24 +206,22 @@ __global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
int box_idx = blockIdx.z; int box_idx = blockIdx.z;
int channel_idx = blockIdx.y; int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
int x_idx = voxel_idx_flat / (out_y * out_z);
int x_idx = voxel_idx_flat / (out_y * out_z); int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; int z_idx = voxel_idx_flat % out_z;
int z_idx = voxel_idx_flat % out_z; if (box_idx >= boxes_num || channel_idx >= channels) return;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z) int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
return; argmax += box_idx * out_x * out_y * out_z * channels +
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
argmax += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
grad_out += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx; offset_base * channels + channel_idx;
grad_out += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
if (argmax[0] == -1) return; if (argmax[0] == -1) return;
atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1); atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
}
} }
template <typename T> template <typename T>
...@@ -242,26 +236,24 @@ __global__ void roiaware_avgpool3d_backward(int boxes_num, int channels, ...@@ -242,26 +236,24 @@ __global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
int box_idx = blockIdx.z; int box_idx = blockIdx.z;
int channel_idx = blockIdx.y; int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
int x_idx = voxel_idx_flat / (out_y * out_z);
int x_idx = voxel_idx_flat / (out_y * out_z); int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; int z_idx = voxel_idx_flat % out_z;
int z_idx = voxel_idx_flat % out_z; if (box_idx >= boxes_num || channel_idx >= channels) return;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z) int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
return; pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; grad_out += box_idx * out_x * out_y * out_z * channels +
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * channels + channel_idx;
offset_base * max_pts_each_voxel;
grad_out += box_idx * out_x * out_y * out_z * channels + int total_pts = pts_idx_of_voxels[0];
offset_base * channels + channel_idx; float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
for (int k = 1; k <= total_pts; k++) {
int total_pts = pts_idx_of_voxels[0]; atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
float cur_grad = 1 / fmaxf(float(total_pts), 1.0); grad_out[0] * cur_grad);
for (int k = 1; k <= total_pts; k++) { }
atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
grad_out[0] * cur_grad);
} }
} }
......
...@@ -42,23 +42,23 @@ __global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, ...@@ -42,23 +42,23 @@ __global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num,
// params boxes3d: (B, M, 7) // params boxes3d: (B, M, 7)
// params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means
// background points // background points
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y; int box_idx = blockIdx.y;
int bs_idx = blockIdx.z; int bs_idx = blockIdx.z;
CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
if (box_idx >= boxes_num || bs_idx >= batch_size) return;
if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size) { int assign_idx =
return; bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
} pts_assign[assign_idx] = 0;
int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
pts_assign[assign_idx] = 0;
int box_offset = bs_idx * boxes_num * 7 + box_idx * 7; int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3; int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
T local_x = 0, local_y = 0; T local_x = 0, local_y = 0;
int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset,
local_x, local_y); local_x, local_y);
pts_assign[assign_idx] = cur_in_flag; pts_assign[assign_idx] = cur_in_flag;
}
} }
__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
...@@ -69,35 +69,32 @@ __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, ...@@ -69,35 +69,32 @@ __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
// params pts_assign: (B, N) // params pts_assign: (B, N)
// params pts_idx: (B, M, 512) // params pts_idx: (B, M, 512)
// params pooled_empty_flag: (B, M) // params pooled_empty_flag: (B, M)
CUDA_1D_KERNEL_LOOP(boxes_idx, boxes_num) {
int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x; int bs_idx = blockIdx.y;
if (boxes_idx >= boxes_num) {
return; int cnt = 0;
} for (int k = 0; k < pts_num; k++) {
if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num +
int bs_idx = blockIdx.y; boxes_idx]) {
if (cnt < sampled_pts_num) {
int cnt = 0; pts_idx[bs_idx * boxes_num * sampled_pts_num +
for (int k = 0; k < pts_num; k++) { boxes_idx * sampled_pts_num + cnt] = k;
if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]) { cnt++;
if (cnt < sampled_pts_num) { } else
pts_idx[bs_idx * boxes_num * sampled_pts_num + break;
boxes_idx * sampled_pts_num + cnt] = k; }
cnt++;
} else
break;
} }
}
if (cnt == 0) { if (cnt == 0) {
pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1; pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
} else if (cnt < sampled_pts_num) { } else if (cnt < sampled_pts_num) {
// duplicate same points for sampling // duplicate same points for sampling
for (int k = cnt; k < sampled_pts_num; k++) { for (int k = cnt; k < sampled_pts_num; k++) {
int duplicate_idx = k % cnt; int duplicate_idx = k % cnt;
int base_offset = int base_offset =
bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num; bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx]; pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
}
} }
} }
} }
...@@ -112,33 +109,26 @@ __global__ void roipoint_pool3d_forward( ...@@ -112,33 +109,26 @@ __global__ void roipoint_pool3d_forward(
// params pts_feature: (B, N, C) // params pts_feature: (B, N, C)
// params pooled_features: (B, M, 512, 3+C) // params pooled_features: (B, M, 512, 3+C)
// params pooled_empty_flag: (B, M) // params pooled_empty_flag: (B, M)
int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y; int box_idx = blockIdx.y;
int bs_idx = blockIdx.z; int bs_idx = blockIdx.z;
CUDA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) {
if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num || if (box_idx >= boxes_num || bs_idx >= batch_size) return;
bs_idx >= batch_size) { if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return;
return;
} int temp_idx = bs_idx * boxes_num * sampled_pts_num +
box_idx * sampled_pts_num + sample_pt_idx;
if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) { int src_pt_idx = pts_idx[temp_idx];
return; int dst_feature_offset = temp_idx * (3 + feature_in_len);
for (int j = 0; j < 3; j++)
pooled_features[dst_feature_offset + j] =
xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
int src_feature_offset =
bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
memcpy(pooled_features + dst_feature_offset + 3,
pts_feature + src_feature_offset, feature_in_len * sizeof(T));
} }
int temp_idx = bs_idx * boxes_num * sampled_pts_num +
box_idx * sampled_pts_num + sample_pt_idx;
int src_pt_idx = pts_idx[temp_idx];
int dst_feature_offset = temp_idx * (3 + feature_in_len);
for (int j = 0; j < 3; j++)
pooled_features[dst_feature_offset + j] =
xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
int src_feature_offset =
bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
memcpy(pooled_features + dst_feature_offset + 3,
pts_feature + src_feature_offset, feature_in_len * sizeof(T));
} }
#endif // ROIPOINT_POOL3D_CUDA_KERNEL_CUH #endif // ROIPOINT_POOL3D_CUDA_KERNEL_CUH
...@@ -20,17 +20,17 @@ __global__ void three_interpolate_forward_cuda_kernel( ...@@ -20,17 +20,17 @@ __global__ void three_interpolate_forward_cuda_kernel(
int bs_idx = blockIdx.z; int bs_idx = blockIdx.z;
int c_idx = blockIdx.y; int c_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(pt_idx, n) {
if (bs_idx >= b || c_idx >= c) return;
if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; weight += bs_idx * n * 3 + pt_idx * 3;
points += bs_idx * c * m + c_idx * m;
idx += bs_idx * n * 3 + pt_idx * 3;
out += bs_idx * c * n + c_idx * n;
weight += bs_idx * n * 3 + pt_idx * 3; out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
points += bs_idx * c * m + c_idx * m; weight[2] * points[idx[2]];
idx += bs_idx * n * 3 + pt_idx * 3; }
out += bs_idx * c * n + c_idx * n;
out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
weight[2] * points[idx[2]];
} }
template <typename T> template <typename T>
...@@ -44,18 +44,18 @@ __global__ void three_interpolate_backward_cuda_kernel( ...@@ -44,18 +44,18 @@ __global__ void three_interpolate_backward_cuda_kernel(
int bs_idx = blockIdx.z; int bs_idx = blockIdx.z;
int c_idx = blockIdx.y; int c_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(pt_idx, n) {
if (bs_idx >= b || c_idx >= c) return;
if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
grad_out += bs_idx * c * n + c_idx * n + pt_idx;
grad_out += bs_idx * c * n + c_idx * n + pt_idx; weight += bs_idx * n * 3 + pt_idx * 3;
weight += bs_idx * n * 3 + pt_idx * 3; grad_points += bs_idx * c * m + c_idx * m;
grad_points += bs_idx * c * m + c_idx * m; idx += bs_idx * n * 3 + pt_idx * 3;
idx += bs_idx * n * 3 + pt_idx * 3;
atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); }
} }
#endif // THREE_INTERPOLATE_CUDA_KERNEL_CUH #endif // THREE_INTERPOLATE_CUDA_KERNEL_CUH
...@@ -19,48 +19,49 @@ __global__ void three_nn_forward_cuda_kernel(int b, int n, int m, ...@@ -19,48 +19,49 @@ __global__ void three_nn_forward_cuda_kernel(int b, int n, int m,
// idx: (B, N, 3) // idx: (B, N, 3)
int bs_idx = blockIdx.y; int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; CUDA_1D_KERNEL_LOOP(pt_idx, n) {
if (bs_idx >= b || pt_idx >= n) return; if (bs_idx >= b) return;
unknown += bs_idx * n * 3 + pt_idx * 3; unknown += bs_idx * n * 3 + pt_idx * 3;
known += bs_idx * m * 3; known += bs_idx * m * 3;
dist2 += bs_idx * n * 3 + pt_idx * 3; dist2 += bs_idx * n * 3 + pt_idx * 3;
idx += bs_idx * n * 3 + pt_idx * 3; idx += bs_idx * n * 3 + pt_idx * 3;
T ux = unknown[0]; T ux = unknown[0];
T uy = unknown[1]; T uy = unknown[1];
T uz = unknown[2]; T uz = unknown[2];
double best1 = 1e40, best2 = 1e40, best3 = 1e40; double best1 = 1e40, best2 = 1e40, best3 = 1e40;
int besti1 = 0, besti2 = 0, besti3 = 0; int besti1 = 0, besti2 = 0, besti3 = 0;
for (int k = 0; k < m; ++k) { for (int k = 0; k < m; ++k) {
T x = known[k * 3 + 0]; T x = known[k * 3 + 0];
T y = known[k * 3 + 1]; T y = known[k * 3 + 1];
T z = known[k * 3 + 2]; T z = known[k * 3 + 2];
T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
if (d < best1) { if (d < best1) {
best3 = best2; best3 = best2;
besti3 = besti2; besti3 = besti2;
best2 = best1; best2 = best1;
besti2 = besti1; besti2 = besti1;
best1 = d; best1 = d;
besti1 = k; besti1 = k;
} else if (d < best2) { } else if (d < best2) {
best3 = best2; best3 = best2;
besti3 = besti2; besti3 = besti2;
best2 = d; best2 = d;
besti2 = k; besti2 = k;
} else if (d < best3) { } else if (d < best3) {
best3 = d; best3 = d;
besti3 = k; besti3 = k;
}
} }
dist2[0] = best1;
dist2[1] = best2;
dist2[2] = best3;
idx[0] = besti1;
idx[1] = besti2;
idx[2] = besti3;
} }
dist2[0] = best1;
dist2[1] = best2;
dist2[2] = best3;
idx[0] = besti1;
idx[1] = besti2;
idx[2] = besti3;
} }
#endif // THREE_NN_CUDA_KERNEL_CUH #endif // THREE_NN_CUDA_KERNEL_CUH
...@@ -101,7 +101,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor, ...@@ -101,7 +101,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor,
CUDA_1D_KERNEL_LOOP(index, num_points) { CUDA_1D_KERNEL_LOOP(index, num_points) {
auto coor_offset = coor + index * NDim; auto coor_offset = coor + index * NDim;
// skip invalid points // skip invalid points
if ((index >= num_points) || (coor_offset[0] == -1)) return; if (coor_offset[0] == -1) return;
int num = 0; int num = 0;
int coor_x = coor_offset[0]; int coor_x = coor_offset[0];
......
...@@ -6,8 +6,6 @@ ...@@ -6,8 +6,6 @@
using namespace at; using namespace at;
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CHECK_CUDA(x) \ #define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CPU(x) \ #define CHECK_CPU(x) \
......
...@@ -73,8 +73,8 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num, ...@@ -73,8 +73,8 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
int64_t *keep_data = keep.data_ptr<int64_t>(); int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>(); int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); const int col_blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask = Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong)); at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data = unsigned long long *mask_data =
......
...@@ -13,7 +13,7 @@ void AssignScoreWithKForwardCUDAKernelLauncher( ...@@ -13,7 +13,7 @@ void AssignScoreWithKForwardCUDAKernelLauncher(
at::cuda::CUDAGuard device_guard(points.device()); at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(DIVUP(B * O * N1 * K, THREADS_PER_BLOCK)); dim3 blocks(GET_BLOCKS(B * O * N1 * K, THREADS_PER_BLOCK));
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
...@@ -36,9 +36,9 @@ void AssignScoreWithKBackwardCUDAKernelLauncher( ...@@ -36,9 +36,9 @@ void AssignScoreWithKBackwardCUDAKernelLauncher(
at::cuda::CUDAGuard device_guard(grad_out.device()); at::cuda::CUDAGuard device_guard(grad_out.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks1(DIVUP(B * M * O, THREADS_PER_BLOCK)); dim3 blocks1(GET_BLOCKS(B * M * O, THREADS_PER_BLOCK));
dim3 threads1(THREADS_PER_BLOCK); dim3 threads1(THREADS_PER_BLOCK);
dim3 blocks2(DIVUP(B * N1 * K * M, THREADS_PER_BLOCK)); dim3 blocks2(GET_BLOCKS(B * N1 * K * M, THREADS_PER_BLOCK));
dim3 threads2(THREADS_PER_BLOCK); dim3 threads2(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
...@@ -22,7 +22,7 @@ void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius, ...@@ -22,7 +22,7 @@ void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row) // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
...@@ -16,7 +16,7 @@ void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints, ...@@ -16,7 +16,7 @@ void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row) // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
...@@ -43,7 +43,7 @@ void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints, ...@@ -43,7 +43,7 @@ void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row) // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment