Unverified Commit b586cc2f authored by Jiazhen Wang, committed by GitHub

[Refactor] Replace DIVUP with GET_BLOCKS (#1586)

* [Improve] migrate DIVUP to GET_BLOCKS

* [Fix] use GET_BLOCKS only for block allocation and delete useless statements

* [Fix] add kernel loop for nms and delete useless statements
parent cf754db9
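For context on why the replacement is not purely mechanical: DIVUP is a plain ceil-division macro (its definition is visible in the removed trt_cuda_helper.hpp line at the bottom of this diff), while GET_BLOCKS computes the same quotient but clamps it so the launch grid stays bounded. The sketch below is a hedged, self-contained illustration of that difference; the GET_BLOCKS body and its 4096 cap are paraphrased assumptions about the common mmcv-style helper, not a quote of the real header.

```cpp
// Hedged sketch: DIVUP is quoted from the removed TensorRT helper below;
// GET_BLOCKS is a paraphrase (the actual cap and signature may differ).
#include <algorithm>
#include <cstdio>

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))  // exact ceil-division

inline int GET_BLOCKS(const int N, const int num_threads) {
  // Same ceil-division, but clamped so the launch grid cannot grow without bound.
  return std::min((N + num_threads - 1) / num_threads, 4096);  // assumed cap
}

int main() {
  const int threads = 1024;
  printf("%d vs %d\n", DIVUP(1 << 22, threads), GET_BLOCKS(1 << 22, threads));  // 4096 vs 4096
  printf("%d vs %d\n", DIVUP(1 << 26, threads), GET_BLOCKS(1 << 26, threads));  // 65536 vs 4096
  return 0;
}
```

Because GET_BLOCKS may return fewer blocks than the exact quotient, the commit uses it only where the result feeds a kernel launch (and the kernel iterates to cover the remainder), and keeps an explicit `(n + threads - 1) / threads` wherever the quotient defines a memory layout, such as the NMS mask width or the TensorRT workspace sizes further down.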
@@ -19,7 +19,7 @@ void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);
+ dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -46,7 +46,7 @@ void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);
+ dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -21,8 +21,8 @@ void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK_IOU3D),
-     DIVUP(num_a, THREADS_PER_BLOCK_IOU3D));
+ dim3 blocks(GET_BLOCKS(num_b, THREADS_PER_BLOCK_IOU3D),
+     GET_BLOCKS(num_a, THREADS_PER_BLOCK_IOU3D));
dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D);
iou3d_boxes_overlap_bev_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
@@ -41,8 +41,8 @@ void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK_IOU3D),
-     DIVUP(num_a, THREADS_PER_BLOCK_IOU3D));
+ dim3 blocks(GET_BLOCKS(num_b, THREADS_PER_BLOCK_IOU3D),
+     GET_BLOCKS(num_a, THREADS_PER_BLOCK_IOU3D));
dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D);
iou3d_boxes_iou_bev_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
@@ -58,8 +58,8 @@ void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes,
at::cuda::CUDAGuard device_guard(boxes.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
-     DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
+ dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
+     GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
dim3 threads(THREADS_PER_BLOCK_NMS);
nms_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
@@ -75,8 +75,8 @@ void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes,
at::cuda::CUDAGuard device_guard(boxes.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
-     DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
+ dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
+     GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
dim3 threads(THREADS_PER_BLOCK_NMS);
nms_normal_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
......
@@ -19,7 +19,7 @@ void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);
+ dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -13,10 +13,11 @@ Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
auto boxes_sorted = boxes.index_select(0, order_t);
int boxes_num = boxes.size(0);
- const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
+ const int col_blocks = (boxes_num + threadsPerBlock - 1) / threadsPerBlock;
+ const int col_blocks_alloc = GET_BLOCKS(boxes_num, threadsPerBlock);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
- dim3 blocks(col_blocks, col_blocks);
+ dim3 blocks(col_blocks_alloc, col_blocks_alloc);
dim3 threads(threadsPerBlock);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
nms_cuda<<<blocks, threads, 0, stream>>>(
......
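The nms_cuda.cu hunk above is the case the second and third commit-message bullets describe: the exact quotient `col_blocks` still determines the mask tensor's width, while the possibly clamped `col_blocks_alloc` only sizes the launch grid, so the kernel has to stride over logical block indices. The following is a minimal, self-contained sketch of that pattern, not the actual mmcv nms_cuda kernel; the helper names, the deliberately tiny cap, and the tile bookkeeping are illustrative assumptions.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

constexpr int kThreadsPerBlock = 64;

inline int CeilDiv(int n, int d) { return (n + d - 1) / d; }
// Hypothetical stand-in for GET_BLOCKS; the cap of 4 is artificially small so
// clamping actually kicks in for this small example.
inline int GetBlocksClamped(int n, int d) {
  int blocks = CeilDiv(n, d);
  return blocks < 4 ? blocks : 4;
}

// Each logical (row, col) tile must be processed once even though gridDim may be
// smaller than the logical tile counts, hence the block-stride loops.
__global__ void TileKernel(int col_blocks, int* tile_visits) {
  for (int row = blockIdx.y; row < col_blocks; row += gridDim.y) {
    for (int col = blockIdx.x; col < col_blocks; col += gridDim.x) {
      if (threadIdx.x == 0) atomicAdd(&tile_visits[row * col_blocks + col], 1);
    }
  }
}

int main() {
  const int num_boxes = 1000;
  const int col_blocks = CeilDiv(num_boxes, kThreadsPerBlock);                  // layout: exact (16)
  const int col_blocks_alloc = GetBlocksClamped(num_boxes, kThreadsPerBlock);   // launch: clamped (4)

  int* tile_visits = nullptr;
  cudaMallocManaged(&tile_visits, sizeof(int) * col_blocks * col_blocks);
  cudaMemset(tile_visits, 0, sizeof(int) * col_blocks * col_blocks);

  dim3 blocks(col_blocks_alloc, col_blocks_alloc);
  dim3 threads(kThreadsPerBlock);
  TileKernel<<<blocks, threads>>>(col_blocks, tile_visits);
  cudaDeviceSynchronize();

  bool all_covered = true;
  for (int i = 0; i < col_blocks * col_blocks; ++i) all_covered &= (tile_visits[i] == 1);
  printf("every tile visited exactly once: %s\n", all_covered ? "yes" : "no");
  cudaFree(tile_visits);
  return 0;
}
```

Without the loop, a clamped grid would silently skip the tiles beyond `gridDim`, which is why the exact `col_blocks` value must still be used anywhere the mask is indexed (the usual `cur_box_idx * col_blocks + col_start` layout) rather than the allocated grid size.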
@@ -21,7 +21,7 @@ void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,
at::cuda::CUDAGuard device_guard(boxes.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
+ dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -47,7 +47,7 @@ void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,
at::cuda::CUDAGuard device_guard(boxes.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
+ dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -26,7 +26,7 @@ void RoiawarePool3dForwardCUDAKernelLauncher(
Tensor pts_mask =
-at::ones({boxes_num, pts_num}, pts_feature.options().dtype(at::kInt));
- dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num);
+ dim3 blocks_mask(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -42,7 +42,7 @@ void RoiawarePool3dForwardCUDAKernelLauncher(
// TODO: Merge the collect and pool functions, SS
- dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
+ dim3 blocks_collect(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK));
AT_DISPATCH_INTEGRAL_TYPES(
pts_idx_of_voxels.scalar_type(), "collect_inside_pts_for_box3d", [&] {
@@ -55,8 +55,8 @@ void RoiawarePool3dForwardCUDAKernelLauncher(
AT_CUDA_CHECK(cudaGetLastError());
- dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
-     boxes_num);
+ dim3 blocks_pool(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK),
+     channels, boxes_num);
if (pool_method == 0) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
pts_feature.scalar_type(), "roiaware_maxpool3d", [&] {
@@ -93,7 +93,7 @@ void RoiawarePool3dBackwardCUDAKernelLauncher(
at::cuda::CUDAGuard device_guard(grad_out.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
- dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
+ dim3 blocks(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
boxes_num);
dim3 threads(THREADS_PER_BLOCK);
......
@@ -24,7 +24,7 @@ void RoIPointPool3dForwardCUDAKernelLauncher(
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
+ dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -38,14 +38,14 @@ void RoIPointPool3dForwardCUDAKernelLauncher(
boxes3d.options().dtype(at::kInt));
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK), batch_size);
+ dim3 blocks2(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK), batch_size);
get_pooled_idx<<<blocks2, threads, 0, stream>>>(
batch_size, pts_num, boxes_num, sampled_pts_num,
pts_assign.data_ptr<int>(), pts_idx.data_ptr<int>(),
pooled_empty_flag.data_ptr<int>());
- dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,
+ dim3 blocks_pool(GET_BLOCKS(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,
batch_size);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -23,7 +23,7 @@ void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);
+ dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -51,7 +51,7 @@ void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);
+ dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -21,7 +21,7 @@ void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);
+ dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -73,7 +73,8 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
- const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+ const int col_blocks =
+     (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
@@ -117,7 +118,8 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
- const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+ const int col_blocks =
+     (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
......
@@ -85,7 +85,7 @@ void CornerPoolForwardLauncher(const scalar_t *input, scalar_t *output,
case 0:
case 1:
nthreads = batch_size * channels * width;
- col_block = DIVUP(nthreads, THREADS_PER_BLOCK);
+ col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
top_bottom_pool_kernel<scalar_t>
<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output, batch_size, channels, height, width, pool_type);
@@ -93,7 +93,7 @@ void CornerPoolForwardLauncher(const scalar_t *input, scalar_t *output,
case 2:
case 3:
nthreads = batch_size * channels * height;
- col_block = DIVUP(nthreads, THREADS_PER_BLOCK);
+ col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
left_right_pool_kernel<scalar_t>
<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output, batch_size, channels, height, width, pool_type);
......
@@ -67,7 +67,7 @@ void CumMaxMinForwardLauncher(const scalar_t *input, scalar_t *output_value,
const int data_size =
tensor_desc.stride[0] * tensor_desc.shape[0] / tensor_desc.shape[cum_dim];
- const int col_block = DIVUP(data_size, THREADS_PER_BLOCK);
+ const int col_block = GET_BLOCKS(data_size, THREADS_PER_BLOCK);
cummaxmin_kernel<scalar_t><<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output_value, output_index, tensor_desc, cum_dim, cum_type);
......
@@ -114,7 +114,8 @@ size_t get_onnxnms_workspace_size(size_t num_batches, size_t spatial_dimension,
mmcv::getAlignedSize(spatial_dimension * boxes_word_size);
size_t boxes_workspace =
mmcv::getAlignedSize(spatial_dimension * 4 * boxes_word_size);
- const int col_blocks = DIVUP(spatial_dimension, threadsPerBlock);
+ const int col_blocks =
+     (spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
size_t mask_workspace = mmcv::getAlignedSize(spatial_dimension * col_blocks *
sizeof(unsigned long long));
size_t index_template_workspace =
@@ -163,7 +164,8 @@ void TRTNMSCUDAKernelLauncher_float(const float* boxes, const float* scores,
int spatial_dimension, int num_classes,
size_t output_length, void* workspace,
cudaStream_t stream) {
- const int col_blocks = DIVUP(spatial_dimension, threadsPerBlock);
+ const int col_blocks =
+     (spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
float* boxes_sorted = (float*)workspace;
workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * 4 * sizeof(float));
......
@@ -67,7 +67,7 @@ void TRTONNXScatterNDKernelLauncher(const T* data, const int* indices,
num_update_indice *= indice_desc.shape[i];
}
// scatter
- const int col_block = DIVUP(num_update_indice, threadsPerBlock);
+ const int col_block = GET_BLOCKS(num_update_indice, threadsPerBlock);
onnx_scatternd_kernel<<<col_block, threadsPerBlock, 0, stream>>>(
num_update_indice, indices, update, output, tensor_desc, indice_desc);
}
......
@@ -3,8 +3,6 @@
#define TRT_CUDA_HELPER_HPP
#include <cublas_v2.h>
- #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define cudaCheckError() \
{ \
cudaError_t e = cudaGetLastError(); \
......