Unverified Commit b586cc2f authored by Jiazhen Wang, committed by GitHub

[Refactor] Replace DIVUP with GET_BLOCKS (#1586)

* [Improve] migrate DIVUP to GET_BLOCKS

* [Fix] use GET_BLOCKS only for block allocation and delete useless statements

* [Fix] add kernel loop for nms and delete useless statements
parent cf754db9
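For context on why the replacement is not purely mechanical: DIVUP is a plain ceil-division macro (its definition is visible in the removed trt_cuda_helper.hpp line at the bottom of this diff), while GET_BLOCKS computes the same quotient but clamps it so the launch grid stays bounded. The sketch below is a hedged, self-contained illustration of that difference; the GET_BLOCKS body and its 4096 cap are paraphrased assumptions about the common mmcv-style helper, not a quote of the real header.

```cpp
// Hedged sketch: DIVUP is quoted from the removed TensorRT helper below;
// GET_BLOCKS is a paraphrase (the actual cap and signature may differ).
#include <algorithm>
#include <cstdio>

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))  // exact ceil-division

inline int GET_BLOCKS(const int N, const int num_threads) {
  // Same ceil-division, but clamped so the launch grid cannot grow without bound.
  return std::min((N + num_threads - 1) / num_threads, 4096);  // assumed cap
}

int main() {
  const int threads = 1024;
  printf("%d vs %d\n", DIVUP(1 << 22, threads), GET_BLOCKS(1 << 22, threads));  // 4096 vs 4096
  printf("%d vs %d\n", DIVUP(1 << 26, threads), GET_BLOCKS(1 << 26, threads));  // 65536 vs 4096
  return 0;
}
```

Because GET_BLOCKS may return fewer blocks than the exact quotient, the commit uses it only where the result feeds a kernel launch (and the kernel iterates to cover the remainder), and keeps an explicit `(n + threads - 1) / threads` wherever the quotient defines a memory layout, such as the NMS mask width or the TensorRT workspace sizes further down.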
@@ -19,7 +19,7 @@ void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);
+ dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -46,7 +46,7 @@ void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);
+ dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -21,8 +21,8 @@ void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK_IOU3D),
-     DIVUP(num_a, THREADS_PER_BLOCK_IOU3D));
+ dim3 blocks(GET_BLOCKS(num_b, THREADS_PER_BLOCK_IOU3D),
+     GET_BLOCKS(num_a, THREADS_PER_BLOCK_IOU3D));
dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D);
iou3d_boxes_overlap_bev_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
@@ -41,8 +41,8 @@ void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK_IOU3D),
-     DIVUP(num_a, THREADS_PER_BLOCK_IOU3D));
+ dim3 blocks(GET_BLOCKS(num_b, THREADS_PER_BLOCK_IOU3D),
+     GET_BLOCKS(num_a, THREADS_PER_BLOCK_IOU3D));
dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D);
iou3d_boxes_iou_bev_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
@@ -58,8 +58,8 @@ void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes,
at::cuda::CUDAGuard device_guard(boxes.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
-     DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
+ dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
+     GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
dim3 threads(THREADS_PER_BLOCK_NMS);
nms_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
@@ -75,8 +75,8 @@ void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes,
at::cuda::CUDAGuard device_guard(boxes.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
-     DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
+ dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
+     GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
dim3 threads(THREADS_PER_BLOCK_NMS);
nms_normal_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
......
@@ -19,7 +19,7 @@ void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);
+ dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -13,10 +13,11 @@ Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
auto boxes_sorted = boxes.index_select(0, order_t);
int boxes_num = boxes.size(0);
- const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
+ const int col_blocks = (boxes_num + threadsPerBlock - 1) / threadsPerBlock;
+ const int col_blocks_alloc = GET_BLOCKS(boxes_num, threadsPerBlock);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
- dim3 blocks(col_blocks, col_blocks);
+ dim3 blocks(col_blocks_alloc, col_blocks_alloc);
dim3 threads(threadsPerBlock);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
nms_cuda<<<blocks, threads, 0, stream>>>(
......
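The nms_cuda.cu hunk above is the case the second and third commit-message bullets describe: the exact quotient `col_blocks` still determines the mask tensor's width, while the possibly clamped `col_blocks_alloc` only sizes the launch grid, so the kernel has to stride over logical block indices. The following is a minimal, self-contained sketch of that pattern, not the actual mmcv nms_cuda kernel; the helper names, the deliberately tiny cap, and the tile bookkeeping are illustrative assumptions.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

constexpr int kThreadsPerBlock = 64;

inline int CeilDiv(int n, int d) { return (n + d - 1) / d; }
// Hypothetical stand-in for GET_BLOCKS; the cap of 4 is artificially small so
// clamping actually kicks in for this small example.
inline int GetBlocksClamped(int n, int d) {
  int blocks = CeilDiv(n, d);
  return blocks < 4 ? blocks : 4;
}

// Each logical (row, col) tile must be processed once even though gridDim may be
// smaller than the logical tile counts, hence the block-stride loops.
__global__ void TileKernel(int col_blocks, int* tile_visits) {
  for (int row = blockIdx.y; row < col_blocks; row += gridDim.y) {
    for (int col = blockIdx.x; col < col_blocks; col += gridDim.x) {
      if (threadIdx.x == 0) atomicAdd(&tile_visits[row * col_blocks + col], 1);
    }
  }
}

int main() {
  const int num_boxes = 1000;
  const int col_blocks = CeilDiv(num_boxes, kThreadsPerBlock);                  // layout: exact (16)
  const int col_blocks_alloc = GetBlocksClamped(num_boxes, kThreadsPerBlock);   // launch: clamped (4)

  int* tile_visits = nullptr;
  cudaMallocManaged(&tile_visits, sizeof(int) * col_blocks * col_blocks);
  cudaMemset(tile_visits, 0, sizeof(int) * col_blocks * col_blocks);

  dim3 blocks(col_blocks_alloc, col_blocks_alloc);
  dim3 threads(kThreadsPerBlock);
  TileKernel<<<blocks, threads>>>(col_blocks, tile_visits);
  cudaDeviceSynchronize();

  bool all_covered = true;
  for (int i = 0; i < col_blocks * col_blocks; ++i) all_covered &= (tile_visits[i] == 1);
  printf("every tile visited exactly once: %s\n", all_covered ? "yes" : "no");
  cudaFree(tile_visits);
  return 0;
}
```

Without the loop, a clamped grid would silently skip the tiles beyond `gridDim`, which is why the exact `col_blocks` value must still be used anywhere the mask is indexed (the usual `cur_box_idx * col_blocks + col_start` layout) rather than the allocated grid size.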
@@ -21,7 +21,7 @@ void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,
at::cuda::CUDAGuard device_guard(boxes.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
+ dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -47,7 +47,7 @@ void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,
at::cuda::CUDAGuard device_guard(boxes.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
+ dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -26,7 +26,7 @@ void RoiawarePool3dForwardCUDAKernelLauncher(
Tensor pts_mask =
-at::ones({boxes_num, pts_num}, pts_feature.options().dtype(at::kInt));
- dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num);
+ dim3 blocks_mask(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -42,7 +42,7 @@ void RoiawarePool3dForwardCUDAKernelLauncher(
// TODO: Merge the collect and pool functions, SS
- dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
+ dim3 blocks_collect(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK));
AT_DISPATCH_INTEGRAL_TYPES(
pts_idx_of_voxels.scalar_type(), "collect_inside_pts_for_box3d", [&] {
@@ -55,8 +55,8 @@ void RoiawarePool3dForwardCUDAKernelLauncher(
AT_CUDA_CHECK(cudaGetLastError());
- dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
-     boxes_num);
+ dim3 blocks_pool(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK),
+     channels, boxes_num);
if (pool_method == 0) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
pts_feature.scalar_type(), "roiaware_maxpool3d", [&] {
@@ -93,7 +93,7 @@ void RoiawarePool3dBackwardCUDAKernelLauncher(
at::cuda::CUDAGuard device_guard(grad_out.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
+ dim3 blocks(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
boxes_num);
dim3 threads(THREADS_PER_BLOCK);
......
@@ -24,7 +24,7 @@ void RoIPointPool3dForwardCUDAKernelLauncher(
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
+ dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -38,14 +38,14 @@ void RoIPointPool3dForwardCUDAKernelLauncher(
boxes3d.options().dtype(at::kInt));
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK), batch_size);
+ dim3 blocks2(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK), batch_size);
get_pooled_idx<<<blocks2, threads, 0, stream>>>(
batch_size, pts_num, boxes_num, sampled_pts_num,
pts_assign.data_ptr<int>(), pts_idx.data_ptr<int>(),
pooled_empty_flag.data_ptr<int>());
- dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,
+ dim3 blocks_pool(GET_BLOCKS(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,
batch_size);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -23,7 +23,7 @@ void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);
+ dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -51,7 +51,7 @@ void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);
+ dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -21,7 +21,7 @@ void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);
+ dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -73,7 +73,8 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
- const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+ const int col_blocks =
+     (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
@@ -117,7 +118,8 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
- const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+ const int col_blocks =
+     (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
......
@@ -85,7 +85,7 @@ void CornerPoolForwardLauncher(const scalar_t *input, scalar_t *output,
case 0:
case 1:
nthreads = batch_size * channels * width;
- col_block = DIVUP(nthreads, THREADS_PER_BLOCK);
+ col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
top_bottom_pool_kernel<scalar_t>
<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output, batch_size, channels, height, width, pool_type);
@@ -93,7 +93,7 @@ void CornerPoolForwardLauncher(const scalar_t *input, scalar_t *output,
case 2:
case 3:
nthreads = batch_size * channels * height;
- col_block = DIVUP(nthreads, THREADS_PER_BLOCK);
+ col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
left_right_pool_kernel<scalar_t>
<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output, batch_size, channels, height, width, pool_type);
......
@@ -67,7 +67,7 @@ void CumMaxMinForwardLauncher(const scalar_t *input, scalar_t *output_value,
const int data_size =
tensor_desc.stride[0] * tensor_desc.shape[0] / tensor_desc.shape[cum_dim];
- const int col_block = DIVUP(data_size, THREADS_PER_BLOCK);
+ const int col_block = GET_BLOCKS(data_size, THREADS_PER_BLOCK);
cummaxmin_kernel<scalar_t><<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output_value, output_index, tensor_desc, cum_dim, cum_type);
......
@@ -114,7 +114,8 @@ size_t get_onnxnms_workspace_size(size_t num_batches, size_t spatial_dimension,
mmcv::getAlignedSize(spatial_dimension * boxes_word_size);
size_t boxes_workspace =
mmcv::getAlignedSize(spatial_dimension * 4 * boxes_word_size);
- const int col_blocks = DIVUP(spatial_dimension, threadsPerBlock);
+ const int col_blocks =
+     (spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
size_t mask_workspace = mmcv::getAlignedSize(spatial_dimension * col_blocks *
sizeof(unsigned long long));
size_t index_template_workspace =
@@ -163,7 +164,8 @@ void TRTNMSCUDAKernelLauncher_float(const float* boxes, const float* scores,
int spatial_dimension, int num_classes,
size_t output_length, void* workspace,
cudaStream_t stream) {
- const int col_blocks = DIVUP(spatial_dimension, threadsPerBlock);
+ const int col_blocks =
+     (spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
float* boxes_sorted = (float*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * 4 * sizeof(float));
......
@@ -67,7 +67,7 @@ void TRTONNXScatterNDKernelLauncher(const T* data, const int* indices,
num_update_indice *= indice_desc.shape[i];
}
// scatter
- const int col_block = DIVUP(num_update_indice, threadsPerBlock);
+ const int col_block = GET_BLOCKS(num_update_indice, threadsPerBlock);
onnx_scatternd_kernel<<<col_block, threadsPerBlock, 0, stream>>>(
num_update_indice, indices, update, output, tensor_desc, indice_desc);
}
......
@@ -3,8 +3,6 @@
#define TRT_CUDA_HELPER_HPP
#include <cublas_v2.h>
- #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define cudaCheckError() \
{ \
cudaError_t e = cudaGetLastError(); \
......