[Refactor] Replace DIVUP with GET_BLOCKS (#1586)

* [Improve] migrating DIVUP to GET_BLOCKS * [Fix] use GET_BLOCKS only for block alloc and del useless statements * [Fix] add kernel loop for nms and del useless statements

[Refactor] Replace DIVUP with GET_BLOCKS (#1586)
* [Improve] migrating DIVUP to GET_BLOCKS * [Fix] use GET_BLOCKS only for block alloc and del useless statements * [Fix] add kernel loop for nms and del useless statements
b586cc2f · Jiazhen Wang · GitHub · cf754db9 · b586cc2f · b586cc2f
Unverified Commit b586cc2f authored Jan 08, 2022 by Jiazhen Wang Committed by GitHub Jan 08, 2022
20 changed files
--- a/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh
@@ -22,34 +22,34 @@ __global__ void assign_score_withk_forward_cuda_kernel(
    const int O, const int aggregate, const T* points, const T* centers,
    const T* scores, const int64_t* knn_idx, T* output) {
  // ----- parallel loop for B, N1, K and O ---------
-  long i = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) {
-  if (i >= B * N1 * K * O) return;
+    // ------- loop for M ----------
-  // ------- loop for M ----------
+    const int b = (int)(i / (O * N1 * K));
-  const int b = (int)(i / (O * N1 * K));
+    const int o = (int)(i % (O * N1 * K) / (N1 * K));
-  const int o = (int)(i % (O * N1 * K) / (N1 * K));
+    const int n = (int)(i % (N1 * K) / K);
-  const int n = (int)(i % (N1 * K) / K);
+    const int k = (int)(i % K);
-  const int k = (int)(i % K);
+    const int cn = (int)knn_idx[b * K * N1 + n * K +
-  const int cn = (int)knn_idx[b * K * N1 + n * K +
+                                0];  // The first neighbor is the center point
-                              0];  // The first neighbor is the center point
+    const int kn = (int)knn_idx[b * K * N1 + n * K + k];
-  const int kn = (int)knn_idx[b * K * N1 + n * K + k];
+    if (kn >= N0 ||
-  if (kn >= N0 ||
+        kn < 0) {  // if index overflows, it is out of the neighborhood range
-      kn < 0) {  // if index overflows, it is out of the neighborhood range
+      return;
-    return;
+    }
-  }
+    assert(b < B);
-  assert(b < B);
+    assert(kn < N0);
-  assert(kn < N0);
+    assert(cn < N0);
-  assert(cn < N0);
+    assert(o < O);
-  assert(o < O);
+    assert(n < N1);
-  assert(n < N1);
+    const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
-  const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
+    T val = output[out_idx];
-  T val = output[out_idx];
+    for (int m = 0; m < M; m++) {
-  for (int m = 0; m < M; m++) {
+      val += points[b * N0 * M * O + kn * M * O + m * O + o] *
-    val += points[b * N0 * M * O + kn * M * O + m * O + o] *
+                 scores[b * N1 * K * M + n * K * M + k * M + m] -
-               scores[b * N1 * K * M + n * K * M + k * M + m] -
+             centers[b * N0 * M * O + cn * M * O + m * O + o] *
-           centers[b * N0 * M * O + cn * M * O + m * O + o] *
+                 scores[b * N1 * K * M + n * K * M + k * M + m];
-               scores[b * N1 * K * M + n * K * M + k * M + m];
+    }
+    output[out_idx] = val;
  }
-  output[out_idx] = val;
 }
 template <typename T>
@@ -58,27 +58,27 @@ __global__ void assign_score_withk_points_backward_cuda_kernel(
    const int O, const int aggregate, const T* grad_out, const T* scores,
    const int64_t* knn_idx, T* grad_points, T* grad_centers) {
  // ----- parallel loop for B, M, O ---------
-  long i = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(i, B * M * O) {
-  if (i >= B * M * O) return;
+    int b = (int)(i / (M * O));
-  int b = (int)(i / (M * O));
+    int m = (int)(i % (M * O) / O);
-  int m = (int)(i % (M * O) / O);
+    int o = (int)(i % O);
-  int o = (int)(i % O);
-  // ----- loop for N,K ---------
+    // ----- loop for N,K ---------
-  for (int n = 0; n < N; n++) {
+    for (int n = 0; n < N; n++) {
-    for (int k = 0; k < K; k++) {
+      for (int k = 0; k < K; k++) {
-      int kn = knn_idx[b * N * K + n * K + k];
+        int kn = knn_idx[b * N * K + n * K + k];
-      int cn = knn_idx[b * N * K + n * K + 0];
+        int cn = knn_idx[b * N * K + n * K + 0];
-      if (kn >= N0 ||
+        if (kn >= N0 || kn < 0) {  // if index overflows, it is out of the
-          kn < 0) {  // if index overflows, it is out of the neighborhood range
+                                   // neighborhood range
-        continue;
+          continue;
+        }
+        atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
+                  scores[b * N * K * M + n * K * M + k * M + m] *
+                      grad_out[b * O * N * K + o * N * K + n * K + k]);
+        atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
+                  -scores[b * N * K * M + n * K * M + k * M + m] *
+                      grad_out[b * O * N * K + o * N * K + n * K + k]);
      }
-      atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
-                scores[b * N * K * M + n * K * M + k * M + m] *
-                    grad_out[b * O * N * K + o * N * K + n * K + k]);
-      atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
-                -scores[b * N * K * M + n * K * M + k * M + m] *
-                    grad_out[b * O * N * K + o * N * K + n * K + k]);
    }
  }
 }
@@ -89,28 +89,28 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel(
    const int O, const int aggregate, const T* grad_out, const T* points,
    const T* centers, const int64_t* knn_idx, T* grad_scores) {
  // ----- parallel loop for B, N, K, M ---------
-  long i = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(i, B * N * K * M) {
-  if (i >= B * N * K * M) return;
+    const int b = (int)(i / (N * M * K));
-  const int b = (int)(i / (N * M * K));
+    const int n = (int)(i % (N * M * K) / M / K);
-  const int n = (int)(i % (N * M * K) / M / K);
+    const int k = (int)(i % (M * K) / M);
-  const int k = (int)(i % (M * K) / M);
+    const int m = (int)(i % M);
-  const int m = (int)(i % M);
+    const int cn = knn_idx[b * N * K + n * K + 0];
-  const int cn = knn_idx[b * N * K + n * K + 0];
+    const int kn = knn_idx[b * N * K + n * K + k];
-  const int kn = knn_idx[b * N * K + n * K + k];
+    if (kn >= N0 ||
-  if (kn >= N0 ||
+        kn < 0) {  // if index overflows, it is out of the neighborhood range
-      kn < 0) {  // if index overflows, it is out of the neighborhood range
+      return;
-    return;
+    }
-  }
-  // -------------- loop for O ------------------------
+    // -------------- loop for O ------------------------
-  const int out_idx = b * N * K * M + n * K * M + k * M + m;
+    const int out_idx = b * N * K * M + n * K * M + k * M + m;
-  T val = grad_scores[out_idx];
+    T val = grad_scores[out_idx];
-  for (int o = 0; o < O; o++) {
+    for (int o = 0; o < O; o++) {
-    val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
+      val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
-            centers[b * N0 * M * O + cn * M * O + m * O + o]) *
+              centers[b * N0 * M * O + cn * M * O + m * O + o]) *
-           grad_out[b * O * N * K + o * N * K + n * K + k];
+             grad_out[b * O * N * K + o * N * K + n * K + k];
+    }
+    grad_scores[out_idx] = val;
  }
-  grad_scores[out_idx] = val;
 }
 #endif  // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
--- a/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh
@@ -21,35 +21,36 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m,
  // output:
  //      idx: (B, M, nsample)
  int bs_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
-  if (bs_idx >= b || pt_idx >= m) return;
+    if (bs_idx >= b) return;
-  new_xyz += bs_idx * m * 3 + pt_idx * 3;
+    new_xyz += bs_idx * m * 3 + pt_idx * 3;
-  xyz += bs_idx * n * 3;
+    xyz += bs_idx * n * 3;
-  idx += bs_idx * m * nsample + pt_idx * nsample;
+    idx += bs_idx * m * nsample + pt_idx * nsample;
-  float max_radius2 = max_radius * max_radius;
+    float max_radius2 = max_radius * max_radius;
-  float min_radius2 = min_radius * min_radius;
+    float min_radius2 = min_radius * min_radius;
-  T new_x = new_xyz[0];
+    T new_x = new_xyz[0];
-  T new_y = new_xyz[1];
+    T new_y = new_xyz[1];
-  T new_z = new_xyz[2];
+    T new_z = new_xyz[2];
-  int cnt = 0;
+    int cnt = 0;
-  for (int k = 0; k < n; ++k) {
+    for (int k = 0; k < n; ++k) {
-    T x = xyz[k * 3 + 0];
+      T x = xyz[k * 3 + 0];
-    T y = xyz[k * 3 + 1];
+      T y = xyz[k * 3 + 1];
-    T z = xyz[k * 3 + 2];
+      T z = xyz[k * 3 + 2];
-    T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
+      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
-           (new_z - z) * (new_z - z);
+             (new_z - z) * (new_z - z);
-    if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
+      if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
-      if (cnt == 0) {
+        if (cnt == 0) {
-        for (int l = 0; l < nsample; ++l) {
+          for (int l = 0; l < nsample; ++l) {
-          idx[l] = k;
+            idx[l] = k;
+          }
        }
+        idx[cnt] = k;
+        ++cnt;
+        if (cnt >= nsample) break;
      }
-      idx[cnt] = k;
-      ++cnt;
-      if (cnt >= nsample) break;
    }
  }
 }

--- a/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp
+++ b/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp
@@ -7,12 +7,20 @@
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)
-#define THREADS_PER_BLOCK 512
+#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                             \
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
+       i += blockDim.x * gridDim.x)                                 \
+    for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
+         j += blockDim.y * gridDim.y)
+#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m)          \
+  for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
+    for (size_t j = blockIdx.y; j < (m); j += gridDim.y)
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+#define THREADS_PER_BLOCK 512
-inline int GET_BLOCKS(const int N) {
+inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
-  int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+  int optimal_block_num = (N + num_threads - 1) / num_threads;
  int max_block_num = 4096;
  return min(optimal_block_num, max_block_num);
 }

--- a/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh
@@ -22,13 +22,14 @@ __global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m,
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
-  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
+    if (bs_idx >= b || c_idx >= c) return;
-  out += bs_idx * c * m + c_idx * m + pt_idx;
+    out += bs_idx * c * m + c_idx * m + pt_idx;
-  idx += bs_idx * m + pt_idx;
+    idx += bs_idx * m + pt_idx;
-  points += bs_idx * c * n + c_idx * n;
+    points += bs_idx * c * n + c_idx * n;
-  out[0] = points[idx[0]];
+    out[0] = points[idx[0]];
+  }
 }
 template <typename T>
@@ -43,14 +44,15 @@ __global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m,
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
-  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
+    if (bs_idx >= b || c_idx >= c) return;
-  grad_out += bs_idx * c * m + c_idx * m + pt_idx;
+    grad_out += bs_idx * c * m + c_idx * m + pt_idx;
-  idx += bs_idx * m + pt_idx;
+    idx += bs_idx * m + pt_idx;
-  grad_points += bs_idx * c * n + c_idx * n;
+    grad_points += bs_idx * c * n + c_idx * n;
-  atomicAdd(grad_points + idx[0], grad_out[0]);
+    atomicAdd(grad_points + idx[0], grad_out[0]);
+  }
 }
 #endif  // GATHER_POINTS_CUDA_KERNEL_CUH
--- a/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh
@@ -22,18 +22,19 @@ __global__ void group_points_forward_cuda_kernel(int b, int c, int n,
  //      out: (B, C, npoints, nsample)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
-  int pt_idx = index / nsample;
+    if (bs_idx >= b || c_idx >= c) return;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
-  int sample_idx = index % nsample;
+    int pt_idx = index / nsample;
+    int sample_idx = index % nsample;
-  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
-  int in_idx = bs_idx * c * n + c_idx * n + idx[0];
+    int in_idx = bs_idx * c * n + c_idx * n + idx[0];
-  int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+    int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
-                pt_idx * nsample + sample_idx;
+                  pt_idx * nsample + sample_idx;
-  out[out_idx] = points[in_idx];
+    out[out_idx] = points[in_idx];
+  }
 }
 template <typename T>
@@ -48,16 +49,17 @@ __global__ void group_points_backward_cuda_kernel(int b, int c, int n,
  //      grad_points: (B, C, N)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
-  int pt_idx = index / nsample;
+    int pt_idx = index / nsample;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
+    if (bs_idx >= b || c_idx >= c) return;
-  int sample_idx = index % nsample;
+    int sample_idx = index % nsample;
-  grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+    grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
-              pt_idx * nsample + sample_idx;
+                pt_idx * nsample + sample_idx;
-  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
-  atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
+    atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
+  }
 }
 #endif  // GROUP_POINTS_CUDA_KERNEL_CUH
--- a/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh
@@ -220,16 +220,15 @@ __device__ inline float iou_bev(const float *box_a, const float *box_b) {
 __global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel(
    const int num_a, const float *boxes_a, const int num_b,
    const float *boxes_b, float *ans_overlap) {
-  const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+  CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) {
-  const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+    if (a_idx >= num_a || b_idx >= num_b) {
+      return;
-  if (a_idx >= num_a || b_idx >= num_b) {
+    }
-    return;
+    const float *cur_box_a = boxes_a + a_idx * 5;
+    const float *cur_box_b = boxes_b + b_idx * 5;
+    float s_overlap = box_overlap(cur_box_a, cur_box_b);
+    ans_overlap[a_idx * num_b + b_idx] = s_overlap;
  }
-  const float *cur_box_a = boxes_a + a_idx * 5;
-  const float *cur_box_b = boxes_b + b_idx * 5;
-  float s_overlap = box_overlap(cur_box_a, cur_box_b);
-  ans_overlap[a_idx * num_b + b_idx] = s_overlap;
 }
 __global__ void iou3d_boxes_iou_bev_forward_cuda_kernel(const int num_a,
@@ -237,17 +236,16 @@ __global__ void iou3d_boxes_iou_bev_forward_cuda_kernel(const int num_a,
                                                        const int num_b,
                                                        const float *boxes_b,
                                                        float *ans_iou) {
-  const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+  CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) {
-  const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+    if (a_idx >= num_a || b_idx >= num_b) {
+      return;
+    }
-  if (a_idx >= num_a || b_idx >= num_b) {
+    const float *cur_box_a = boxes_a + a_idx * 5;
-    return;
+    const float *cur_box_b = boxes_b + b_idx * 5;
+    float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
+    ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
  }
-  const float *cur_box_a = boxes_a + a_idx * 5;
-  const float *cur_box_b = boxes_b + b_idx * 5;
-  float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
-  ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
 }
 __global__ void nms_forward_cuda_kernel(const int boxes_num,
@@ -256,50 +254,51 @@ __global__ void nms_forward_cuda_kernel(const int boxes_num,
                                        unsigned long long *mask) {
  // params: boxes (N, 5) [x1, y1, x2, y2, ry]
  // params: mask (N, N/THREADS_PER_BLOCK_NMS)
+  const int blocks =
+      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
+    // if (row_start > col_start) return;
+    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
+                               THREADS_PER_BLOCK_NMS);
+    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
+                               THREADS_PER_BLOCK_NMS);
+    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
+    if (threadIdx.x < col_size) {
+      block_boxes[threadIdx.x * 5 + 0] =
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
+      block_boxes[threadIdx.x * 5 + 1] =
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
+      block_boxes[threadIdx.x * 5 + 2] =
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
+      block_boxes[threadIdx.x * 5 + 3] =
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
+      block_boxes[threadIdx.x * 5 + 4] =
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
+    }
+    __syncthreads();
-  const int row_start = blockIdx.y;
+    if (threadIdx.x < row_size) {
-  const int col_start = blockIdx.x;
+      const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+      const float *cur_box = boxes + cur_box_idx * 5;
-  // if (row_start > col_start) return;
-  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
-                             THREADS_PER_BLOCK_NMS);
-  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
-                             THREADS_PER_BLOCK_NMS);
-  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
-  if (threadIdx.x < col_size) {
-    block_boxes[threadIdx.x * 5 + 0] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
-    block_boxes[threadIdx.x * 5 + 1] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
-    block_boxes[threadIdx.x * 5 + 2] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
-    block_boxes[threadIdx.x * 5 + 3] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
-    block_boxes[threadIdx.x * 5 + 4] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
-  }
-  __syncthreads();
-  if (threadIdx.x < row_size) {
-    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
-    const float *cur_box = boxes + cur_box_idx * 5;
-    int i = 0;
+      int i = 0;
-    unsigned long long t = 0;
+      unsigned long long t = 0;
-    int start = 0;
+      int start = 0;
-    if (row_start == col_start) {
+      if (row_start == col_start) {
-      start = threadIdx.x + 1;
+        start = threadIdx.x + 1;
-    }
+      }
-    for (i = start; i < col_size; i++) {
+      for (i = start; i < col_size; i++) {
-      if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
+        if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
-        t |= 1ULL << i;
+          t |= 1ULL << i;
+        }
      }
+      const int col_blocks =
+          (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+      mask[cur_box_idx * col_blocks + col_start] = t;
    }
-    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
-    mask[cur_box_idx * col_blocks + col_start] = t;
  }
 }
@@ -320,49 +319,51 @@ __global__ void nms_normal_forward_cuda_kernel(const int boxes_num,
  // params: boxes (N, 5) [x1, y1, x2, y2, ry]
  // params: mask (N, N/THREADS_PER_BLOCK_NMS)
-  const int row_start = blockIdx.y;
+  const int blocks =
-  const int col_start = blockIdx.x;
+      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
-  // if (row_start > col_start) return;
+    // if (row_start > col_start) return;
-  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
+    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
-                             THREADS_PER_BLOCK_NMS);
+                               THREADS_PER_BLOCK_NMS);
-  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
+    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
-                             THREADS_PER_BLOCK_NMS);
+                               THREADS_PER_BLOCK_NMS);
-  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
+    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
-  if (threadIdx.x < col_size) {
+    if (threadIdx.x < col_size) {
-    block_boxes[threadIdx.x * 5 + 0] =
+      block_boxes[threadIdx.x * 5 + 0] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
-    block_boxes[threadIdx.x * 5 + 1] =
+      block_boxes[threadIdx.x * 5 + 1] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
-    block_boxes[threadIdx.x * 5 + 2] =
+      block_boxes[threadIdx.x * 5 + 2] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
-    block_boxes[threadIdx.x * 5 + 3] =
+      block_boxes[threadIdx.x * 5 + 3] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
-    block_boxes[threadIdx.x * 5 + 4] =
+      block_boxes[threadIdx.x * 5 + 4] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
-  }
+    }
-  __syncthreads();
+    __syncthreads();
-  if (threadIdx.x < row_size) {
+    if (threadIdx.x < row_size) {
-    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+      const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
-    const float *cur_box = boxes + cur_box_idx * 5;
+      const float *cur_box = boxes + cur_box_idx * 5;
-    int i = 0;
+      int i = 0;
-    unsigned long long t = 0;
+      unsigned long long t = 0;
-    int start = 0;
+      int start = 0;
-    if (row_start == col_start) {
+      if (row_start == col_start) {
-      start = threadIdx.x + 1;
+        start = threadIdx.x + 1;
-    }
+      }
-    for (i = start; i < col_size; i++) {
+      for (i = start; i < col_size; i++) {
-      if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
+        if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
-        t |= 1ULL << i;
+          t |= 1ULL << i;
+        }
      }
+      const int col_blocks =
+          (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+      mask[cur_box_idx * col_blocks + col_start] = t;
    }
-    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
-    mask[cur_box_idx * col_blocks + col_start] = t;
  }
 }

--- a/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh
@@ -51,40 +51,41 @@ __global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample,
                                        const T *xyz, const T *new_xyz,
                                        int *__restrict__ idx, T *dist2) {
  int bs_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
-  if (bs_idx >= b || pt_idx >= m) return;
+    if (bs_idx >= b) return;
-  new_xyz += bs_idx * m * 3 + pt_idx * 3;
+    new_xyz += bs_idx * m * 3 + pt_idx * 3;
-  xyz += bs_idx * n * 3;
+    xyz += bs_idx * n * 3;
-  idx += bs_idx * m * nsample + pt_idx * nsample;
+    idx += bs_idx * m * nsample + pt_idx * nsample;
-  dist2 += bs_idx * m * nsample + pt_idx * nsample;
+    dist2 += bs_idx * m * nsample + pt_idx * nsample;
-  T new_x = new_xyz[0];
+    T new_x = new_xyz[0];
-  T new_y = new_xyz[1];
+    T new_y = new_xyz[1];
-  T new_z = new_xyz[2];
+    T new_z = new_xyz[2];
-  float best_dist[100];
+    float best_dist[100];
-  int best_idx[100];
+    int best_idx[100];
-  for (int i = 0; i < nsample; i++) {
+    for (int i = 0; i < nsample; i++) {
-    best_dist[i] = 1e10;
+      best_dist[i] = 1e10;
-    best_idx[i] = 0;
+      best_idx[i] = 0;
-  }
+    }
-  for (int i = 0; i < n; i++) {
+    for (int i = 0; i < n; i++) {
-    T x = xyz[i * 3 + 0];
+      T x = xyz[i * 3 + 0];
-    T y = xyz[i * 3 + 1];
+      T y = xyz[i * 3 + 1];
-    T z = xyz[i * 3 + 2];
+      T z = xyz[i * 3 + 2];
-    T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
+      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
-           (new_z - z) * (new_z - z);
+             (new_z - z) * (new_z - z);
-    if (d2 < best_dist[0]) {
+      if (d2 < best_dist[0]) {
-      best_dist[0] = d2;
+        best_dist[0] = d2;
-      best_idx[0] = i;
+        best_idx[0] = i;
-      reheap(best_dist, best_idx, nsample);
+        reheap(best_dist, best_idx, nsample);
+      }
+    }
+    heap_sort(best_dist, best_idx, nsample);
+    for (int i = 0; i < nsample; i++) {
+      idx[i] = best_idx[i];
+      dist2[i] = best_dist[i];
    }
-  }
-  heap_sort(best_dist, best_idx, nsample);
-  for (int i = 0; i < nsample; i++) {
-    idx[i] = best_idx[i];
-    dist2[i] = best_dist[i];
  }
 }

--- a/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh
@@ -15,9 +15,6 @@
 #include "pytorch_cuda_helper.hpp"
 const int CUDA_NUM_THREADS = 1024;
-inline int GET_BLOCKS(const int N, const int num_threads) {
-  return (N + num_threads - 1) / num_threads;
-}
 template <typename scalar_t>
 __device__ scalar_t ms_deform_attn_im2col_bilinear(

--- a/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh
@@ -30,45 +30,46 @@ __device__ inline bool devIoU(float const *const a, float const *const b,
 __global__ void nms_cuda(const int n_boxes, const float iou_threshold,
                         const int offset, const float *dev_boxes,
                         unsigned long long *dev_mask) {
-  const int row_start = blockIdx.y;
+  int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
-  const int col_start = blockIdx.x;
+  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
-  const int tid = threadIdx.x;
+    const int tid = threadIdx.x;
-  if (row_start > col_start) return;
+    if (row_start > col_start) return;
-  const int row_size =
+    const int row_size =
-      fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+        fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
-  const int col_size =
+    const int col_size =
-      fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+        fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
-  __shared__ float block_boxes[threadsPerBlock * 4];
+    __shared__ float block_boxes[threadsPerBlock * 4];
-  if (tid < col_size) {
+    if (tid < col_size) {
-    block_boxes[tid * 4 + 0] =
+      block_boxes[tid * 4 + 0] =
-        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
+          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
-    block_boxes[tid * 4 + 1] =
+      block_boxes[tid * 4 + 1] =
-        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
+          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
-    block_boxes[tid * 4 + 2] =
+      block_boxes[tid * 4 + 2] =
-        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
+          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
-    block_boxes[tid * 4 + 3] =
+      block_boxes[tid * 4 + 3] =
-        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
+          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
-  }
-  __syncthreads();
-  if (tid < row_size) {
-    const int cur_box_idx = threadsPerBlock * row_start + tid;
-    const float *cur_box = dev_boxes + cur_box_idx * 4;
-    int i = 0;
-    unsigned long long int t = 0;
-    int start = 0;
-    if (row_start == col_start) {
-      start = tid + 1;
    }
-    for (i = start; i < col_size; i++) {
+    __syncthreads();
-      if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
-        t |= 1ULL << i;
+    if (tid < row_size) {
+      const int cur_box_idx = threadsPerBlock * row_start + tid;
+      const float *cur_box = dev_boxes + cur_box_idx * 4;
+      int i = 0;
+      unsigned long long int t = 0;
+      int start = 0;
+      if (row_start == col_start) {
+        start = tid + 1;
+      }
+      for (i = start; i < col_size; i++) {
+        if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
+          t |= 1ULL << i;
+        }
      }
+      dev_mask[cur_box_idx * gridDim.y + col_start] = t;
    }
-    dev_mask[cur_box_idx * gridDim.y + col_start] = t;
  }
 }
 #endif  // NMS_CUDA_KERNEL_CUH
--- a/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh
@@ -45,20 +45,21 @@ __global__ void points_in_boxes_part_forward_cuda_kernel(
  // (B, npoints), default -1
  int bs_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
-  if (bs_idx >= batch_size || pt_idx >= pts_num) return;
+    if (bs_idx >= batch_size) return;
-  boxes += bs_idx * boxes_num * 7;
+    boxes += bs_idx * boxes_num * 7;
-  pts += bs_idx * pts_num * 3 + pt_idx * 3;
+    pts += bs_idx * pts_num * 3 + pt_idx * 3;
-  box_idx_of_points += bs_idx * pts_num + pt_idx;
+    box_idx_of_points += bs_idx * pts_num + pt_idx;
-  T local_x = 0, local_y = 0;
+    T local_x = 0, local_y = 0;
-  int cur_in_flag = 0;
+    int cur_in_flag = 0;
-  for (int k = 0; k < boxes_num; k++) {
+    for (int k = 0; k < boxes_num; k++) {
-    cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
+      cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
-    if (cur_in_flag) {
+      if (cur_in_flag) {
-      box_idx_of_points[0] = k;
+        box_idx_of_points[0] = k;
-      break;
+        break;
+      }
    }
  }
 }
@@ -73,19 +74,20 @@ __global__ void points_in_boxes_all_forward_cuda_kernel(
  // (B, npoints), default -1
  int bs_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
-  if (bs_idx >= batch_size || pt_idx >= pts_num) return;
+    if (bs_idx >= batch_size) return;
-  boxes += bs_idx * boxes_num * 7;
+    boxes += bs_idx * boxes_num * 7;
-  pts += bs_idx * pts_num * 3 + pt_idx * 3;
+    pts += bs_idx * pts_num * 3 + pt_idx * 3;
-  box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;
+    box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;
-  T local_x = 0, local_y = 0;
+    T local_x = 0, local_y = 0;
-  for (int k = 0; k < boxes_num; k++) {
+    for (int k = 0; k < boxes_num; k++) {
-    const int cur_in_flag =
+      const int cur_in_flag =
-        check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
+          check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
-    if (cur_in_flag) {
+      if (cur_in_flag) {
-      box_idx_of_points[k] = 1;
+        box_idx_of_points[k] = 1;
+      }
    }
  }
 }

--- a/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh
@@ -44,37 +44,38 @@ __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
  // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N,
  // npoints): -1 means point does not in this box, otherwise: encode (x_idxs,
  // y_idxs, z_idxs) by binary bit
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int box_idx = blockIdx.y;
-  if (pt_idx >= pts_num || box_idx >= boxes_num) return;
+  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
+    if (box_idx >= boxes_num) return;
-  pts += pt_idx * 3;
+    pts += pt_idx * 3;
-  rois += box_idx * 7;
+    rois += box_idx * 7;
-  pts_mask += box_idx * pts_num + pt_idx;
+    pts_mask += box_idx * pts_num + pt_idx;
-  T local_x = 0, local_y = 0;
+    T local_x = 0, local_y = 0;
-  int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
+    int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
-  pts_mask[0] = -1;
+    pts_mask[0] = -1;
-  if (cur_in_flag > 0) {
+    if (cur_in_flag > 0) {
-    T local_z = pts[2] - rois[2];
+      T local_z = pts[2] - rois[2];
-    T x_size = rois[3], y_size = rois[4], z_size = rois[5];
+      T x_size = rois[3], y_size = rois[4], z_size = rois[5];
-    T x_res = x_size / out_x;
+      T x_res = x_size / out_x;
-    T y_res = y_size / out_y;
+      T y_res = y_size / out_y;
-    T z_res = z_size / out_z;
+      T z_res = z_size / out_z;
-    unsigned int x_idx = int((local_x + x_size / 2) / x_res);
+      unsigned int x_idx = int((local_x + x_size / 2) / x_res);
-    unsigned int y_idx = int((local_y + y_size / 2) / y_res);
+      unsigned int y_idx = int((local_y + y_size / 2) / y_res);
-    unsigned int z_idx = int(local_z / z_res);
+      unsigned int z_idx = int(local_z / z_res);
-    x_idx = min(max(x_idx, 0), out_x - 1);
+      x_idx = min(max(x_idx, 0), out_x - 1);
-    y_idx = min(max(y_idx, 0), out_y - 1);
+      y_idx = min(max(y_idx, 0), out_y - 1);
-    z_idx = min(max(z_idx, 0), out_z - 1);
+      z_idx = min(max(z_idx, 0), out_z - 1);
-    unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
+      unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
-    pts_mask[0] = idx_encoding;
+      pts_mask[0] = idx_encoding;
+    }
  }
 }
@@ -86,26 +87,24 @@ __global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,
                                             T *pts_idx_of_voxels) {
  // params pts_mask: (N, npoints)  0 or 1
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
+  CUDA_1D_KERNEL_LOOP(box_idx, boxes_num) {
-  int box_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int max_num_pts = max_pts_each_voxel - 1;  // index 0 is the counter
-  if (box_idx >= boxes_num) return;
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
-  int max_num_pts = max_pts_each_voxel - 1;  // index 0 is the counter
+    for (int k = 0; k < pts_num; k++) {
-  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
+      if (pts_mask[box_idx * pts_num + k] != -1) {
+        unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
-  for (int k = 0; k < pts_num; k++) {
+        unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
-    if (pts_mask[box_idx * pts_num + k] != -1) {
+        unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
-      unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
+        unsigned int z_idx = idx_encoding & 0xFF;
-      unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
+        unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
-      unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
+                                   y_idx * out_z * max_pts_each_voxel +
-      unsigned int z_idx = idx_encoding & 0xFF;
+                                   z_idx * max_pts_each_voxel;
-      unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
+        unsigned int cnt = pts_idx_of_voxels[base_offset];
-                                 y_idx * out_z * max_pts_each_voxel +
+        if (cnt < max_num_pts) {
-                                 z_idx * max_pts_each_voxel;
+          pts_idx_of_voxels[base_offset + cnt + 1] = k;
-      unsigned int cnt = pts_idx_of_voxels[base_offset];
+          pts_idx_of_voxels[base_offset]++;
-      if (cnt < max_num_pts) {
+        }
-        pts_idx_of_voxels[base_offset + cnt + 1] = k;
-        pts_idx_of_voxels[base_offset]++;
      }
    }
  }
@@ -124,39 +123,38 @@ __global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
  int box_idx = blockIdx.z;
  int channel_idx = blockIdx.y;
-  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
-  int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
-  int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
-  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
-      y_idx >= out_y || z_idx >= out_z)
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
-    return;
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                         offset_base * max_pts_each_voxel;
-  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    pooled_features += box_idx * out_x * out_y * out_z * channels +
-  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                       offset_base * channels + channel_idx;
-                       offset_base * max_pts_each_voxel;
+    argmax += box_idx * out_x * out_y * out_z * channels +
-  pooled_features += box_idx * out_x * out_y * out_z * channels +
+              offset_base * channels + channel_idx;
-                     offset_base * channels + channel_idx;
-  argmax += box_idx * out_x * out_y * out_z * channels +
+    int argmax_idx = -1;
-            offset_base * channels + channel_idx;
+    float max_val = -1e50;
-  int argmax_idx = -1;
+    int total_pts = pts_idx_of_voxels[0];
-  float max_val = -1e50;
+    for (int k = 1; k <= total_pts; k++) {
-  int total_pts = pts_idx_of_voxels[0];
+      if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] >
+          max_val) {
-  for (int k = 1; k <= total_pts; k++) {
+        max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
-    if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) {
+        argmax_idx = pts_idx_of_voxels[k];
-      max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
+      }
-      argmax_idx = pts_idx_of_voxels[k];
    }
-  }
-  if (argmax_idx != -1) {
+    if (argmax_idx != -1) {
-    pooled_features[0] = max_val;
+      pooled_features[0] = max_val;
+    }
+    argmax[0] = argmax_idx;
  }
-  argmax[0] = argmax_idx;
 }
 template <typename T>
@@ -172,30 +170,28 @@ __global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,
  int box_idx = blockIdx.z;
  int channel_idx = blockIdx.y;
-  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
-  int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
-  int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
-  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
-      y_idx >= out_y || z_idx >= out_z)
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
-    return;
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                         offset_base * max_pts_each_voxel;
-  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    pooled_features += box_idx * out_x * out_y * out_z * channels +
-  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                       offset_base * channels + channel_idx;
-                       offset_base * max_pts_each_voxel;
-  pooled_features += box_idx * out_x * out_y * out_z * channels +
+    float sum_val = 0;
-                     offset_base * channels + channel_idx;
+    int total_pts = pts_idx_of_voxels[0];
-  float sum_val = 0;
+    for (int k = 1; k <= total_pts; k++) {
-  int total_pts = pts_idx_of_voxels[0];
+      sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
+    }
-  for (int k = 1; k <= total_pts; k++) {
-    sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
-  }
-  if (total_pts > 0) {
+    if (total_pts > 0) {
-    pooled_features[0] = sum_val / total_pts;
+      pooled_features[0] = sum_val / total_pts;
+    }
  }
 }
@@ -210,24 +206,22 @@ __global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
  int box_idx = blockIdx.z;
  int channel_idx = blockIdx.y;
-  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
-  int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
-  int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
-  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
-      y_idx >= out_y || z_idx >= out_z)
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
-    return;
+    argmax += box_idx * out_x * out_y * out_z * channels +
-  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
-  argmax += box_idx * out_x * out_y * out_z * channels +
-            offset_base * channels + channel_idx;
-  grad_out += box_idx * out_x * out_y * out_z * channels +
              offset_base * channels + channel_idx;
+    grad_out += box_idx * out_x * out_y * out_z * channels +
+                offset_base * channels + channel_idx;
-  if (argmax[0] == -1) return;
+    if (argmax[0] == -1) return;
-  atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
+    atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
+  }
 }
 template <typename T>
@@ -242,26 +236,24 @@ __global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
  int box_idx = blockIdx.z;
  int channel_idx = blockIdx.y;
-  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
-  int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
-  int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
-  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
-      y_idx >= out_y || z_idx >= out_z)
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
-    return;
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                         offset_base * max_pts_each_voxel;
-  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    grad_out += box_idx * out_x * out_y * out_z * channels +
-  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                offset_base * channels + channel_idx;
-                       offset_base * max_pts_each_voxel;
-  grad_out += box_idx * out_x * out_y * out_z * channels +
+    int total_pts = pts_idx_of_voxels[0];
-              offset_base * channels + channel_idx;
+    float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
+    for (int k = 1; k <= total_pts; k++) {
-  int total_pts = pts_idx_of_voxels[0];
+      atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
-  float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
+                grad_out[0] * cur_grad);
-  for (int k = 1; k <= total_pts; k++) {
+    }
-    atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
-              grad_out[0] * cur_grad);
  }
 }

--- a/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh
@@ -42,23 +42,23 @@ __global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num,
  // params boxes3d: (B, M, 7)
  // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means
  // background points
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int box_idx = blockIdx.y;
  int bs_idx = blockIdx.z;
+  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
+    if (box_idx >= boxes_num || bs_idx >= batch_size) return;
-  if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size) {
+    int assign_idx =
-    return;
+        bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
-  }
+    pts_assign[assign_idx] = 0;
-  int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
-  pts_assign[assign_idx] = 0;
-  int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
+    int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
-  int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
+    int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
-  T local_x = 0, local_y = 0;
+    T local_x = 0, local_y = 0;
-  int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset,
+    int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset,
-                                      local_x, local_y);
+                                        local_x, local_y);
-  pts_assign[assign_idx] = cur_in_flag;
+    pts_assign[assign_idx] = cur_in_flag;
+  }
 }
 __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
@@ -69,35 +69,32 @@ __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
  // params pts_assign: (B, N)
  // params pts_idx: (B, M, 512)
  // params pooled_empty_flag: (B, M)
+  CUDA_1D_KERNEL_LOOP(boxes_idx, boxes_num) {
-  int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int bs_idx = blockIdx.y;
-  if (boxes_idx >= boxes_num) {
-    return;
+    int cnt = 0;
-  }
+    for (int k = 0; k < pts_num; k++) {
+      if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num +
-  int bs_idx = blockIdx.y;
+                     boxes_idx]) {
+        if (cnt < sampled_pts_num) {
-  int cnt = 0;
+          pts_idx[bs_idx * boxes_num * sampled_pts_num +
-  for (int k = 0; k < pts_num; k++) {
+                  boxes_idx * sampled_pts_num + cnt] = k;
-    if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]) {
+          cnt++;
-      if (cnt < sampled_pts_num) {
+        } else
-        pts_idx[bs_idx * boxes_num * sampled_pts_num +
+          break;
-                boxes_idx * sampled_pts_num + cnt] = k;
+      }
-        cnt++;
-      } else
-        break;
    }
-  }
-  if (cnt == 0) {
+    if (cnt == 0) {
-    pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
+      pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
-  } else if (cnt < sampled_pts_num) {
+    } else if (cnt < sampled_pts_num) {
-    // duplicate same points for sampling
+      // duplicate same points for sampling
-    for (int k = cnt; k < sampled_pts_num; k++) {
+      for (int k = cnt; k < sampled_pts_num; k++) {
-      int duplicate_idx = k % cnt;
+        int duplicate_idx = k % cnt;
-      int base_offset =
+        int base_offset =
-          bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
+            bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
-      pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
+        pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
+      }
    }
  }
 }
@@ -112,33 +109,26 @@ __global__ void roipoint_pool3d_forward(
  // params pts_feature: (B, N, C)
  // params pooled_features: (B, M, 512, 3+C)
  // params pooled_empty_flag: (B, M)
-  int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int box_idx = blockIdx.y;
  int bs_idx = blockIdx.z;
+  CUDA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) {
-  if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num ||
+    if (box_idx >= boxes_num || bs_idx >= batch_size) return;
-      bs_idx >= batch_size) {
+    if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return;
-    return;
-  }
+    int temp_idx = bs_idx * boxes_num * sampled_pts_num +
+                   box_idx * sampled_pts_num + sample_pt_idx;
-  if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) {
+    int src_pt_idx = pts_idx[temp_idx];
-    return;
+    int dst_feature_offset = temp_idx * (3 + feature_in_len);
+    for (int j = 0; j < 3; j++)
+      pooled_features[dst_feature_offset + j] =
+          xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
+    int src_feature_offset =
+        bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
+    memcpy(pooled_features + dst_feature_offset + 3,
+           pts_feature + src_feature_offset, feature_in_len * sizeof(T));
  }
-  int temp_idx = bs_idx * boxes_num * sampled_pts_num +
-                 box_idx * sampled_pts_num + sample_pt_idx;
-  int src_pt_idx = pts_idx[temp_idx];
-  int dst_feature_offset = temp_idx * (3 + feature_in_len);
-  for (int j = 0; j < 3; j++)
-    pooled_features[dst_feature_offset + j] =
-        xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
-  int src_feature_offset =
-      bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
-  memcpy(pooled_features + dst_feature_offset + 3,
-         pts_feature + src_feature_offset, feature_in_len * sizeof(T));
 }
 #endif  // ROIPOINT_POOL3D_CUDA_KERNEL_CUH
--- a/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh
@@ -20,17 +20,17 @@ __global__ void three_interpolate_forward_cuda_kernel(
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
+    if (bs_idx >= b || c_idx >= c) return;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
+    weight += bs_idx * n * 3 + pt_idx * 3;
+    points += bs_idx * c * m + c_idx * m;
+    idx += bs_idx * n * 3 + pt_idx * 3;
+    out += bs_idx * c * n + c_idx * n;
-  weight += bs_idx * n * 3 + pt_idx * 3;
+    out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
-  points += bs_idx * c * m + c_idx * m;
+                  weight[2] * points[idx[2]];
-  idx += bs_idx * n * 3 + pt_idx * 3;
+  }
-  out += bs_idx * c * n + c_idx * n;
-  out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
-                weight[2] * points[idx[2]];
 }
 template <typename T>
@@ -44,18 +44,18 @@ __global__ void three_interpolate_backward_cuda_kernel(
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
+    if (bs_idx >= b || c_idx >= c) return;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
+    grad_out += bs_idx * c * n + c_idx * n + pt_idx;
-  grad_out += bs_idx * c * n + c_idx * n + pt_idx;
+    weight += bs_idx * n * 3 + pt_idx * 3;
-  weight += bs_idx * n * 3 + pt_idx * 3;
+    grad_points += bs_idx * c * m + c_idx * m;
-  grad_points += bs_idx * c * m + c_idx * m;
+    idx += bs_idx * n * 3 + pt_idx * 3;
-  idx += bs_idx * n * 3 + pt_idx * 3;
+    atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
-  atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
+    atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
-  atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
+    atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
-  atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
+  }
 }
 #endif  // THREE_INTERPOLATE_CUDA_KERNEL_CUH
--- a/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh
@@ -19,48 +19,49 @@ __global__ void three_nn_forward_cuda_kernel(int b, int n, int m,
  //      idx: (B, N, 3)
  int bs_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
-  if (bs_idx >= b || pt_idx >= n) return;
+    if (bs_idx >= b) return;
-  unknown += bs_idx * n * 3 + pt_idx * 3;
+    unknown += bs_idx * n * 3 + pt_idx * 3;
-  known += bs_idx * m * 3;
+    known += bs_idx * m * 3;
-  dist2 += bs_idx * n * 3 + pt_idx * 3;
+    dist2 += bs_idx * n * 3 + pt_idx * 3;
-  idx += bs_idx * n * 3 + pt_idx * 3;
+    idx += bs_idx * n * 3 + pt_idx * 3;
-  T ux = unknown[0];
+    T ux = unknown[0];
-  T uy = unknown[1];
+    T uy = unknown[1];
-  T uz = unknown[2];
+    T uz = unknown[2];
-  double best1 = 1e40, best2 = 1e40, best3 = 1e40;
+    double best1 = 1e40, best2 = 1e40, best3 = 1e40;
-  int besti1 = 0, besti2 = 0, besti3 = 0;
+    int besti1 = 0, besti2 = 0, besti3 = 0;
-  for (int k = 0; k < m; ++k) {
+    for (int k = 0; k < m; ++k) {
-    T x = known[k * 3 + 0];
+      T x = known[k * 3 + 0];
-    T y = known[k * 3 + 1];
+      T y = known[k * 3 + 1];
-    T z = known[k * 3 + 2];
+      T z = known[k * 3 + 2];
-    T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
+      T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
-    if (d < best1) {
+      if (d < best1) {
-      best3 = best2;
+        best3 = best2;
-      besti3 = besti2;
+        besti3 = besti2;
-      best2 = best1;
+        best2 = best1;
-      besti2 = besti1;
+        besti2 = besti1;
-      best1 = d;
+        best1 = d;
-      besti1 = k;
+        besti1 = k;
-    } else if (d < best2) {
+      } else if (d < best2) {
-      best3 = best2;
+        best3 = best2;
-      besti3 = besti2;
+        besti3 = besti2;
-      best2 = d;
+        best2 = d;
-      besti2 = k;
+        besti2 = k;
-    } else if (d < best3) {
+      } else if (d < best3) {
-      best3 = d;
+        best3 = d;
-      besti3 = k;
+        besti3 = k;
+      }
    }
+    dist2[0] = best1;
+    dist2[1] = best2;
+    dist2[2] = best3;
+    idx[0] = besti1;
+    idx[1] = besti2;
+    idx[2] = besti3;
  }
-  dist2[0] = best1;
-  dist2[1] = best2;
-  dist2[2] = best3;
-  idx[0] = besti1;
-  idx[1] = besti2;
-  idx[2] = besti3;
 }
 #endif  // THREE_NN_CUDA_KERNEL_CUH
--- a/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh
+++ b/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh
@@ -101,7 +101,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor,
  CUDA_1D_KERNEL_LOOP(index, num_points) {
    auto coor_offset = coor + index * NDim;
    // skip invalid points
-    if ((index >= num_points) || (coor_offset[0] == -1)) return;
+    if (coor_offset[0] == -1) return;
    int num = 0;
    int coor_x = coor_offset[0];

--- a/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp
+++ b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp
@@ -6,8 +6,6 @@
 using namespace at;
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
 #define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
 #define CHECK_CPU(x) \

--- a/mmcv/ops/csrc/parrots/iou3d.cpp
+++ b/mmcv/ops/csrc/parrots/iou3d.cpp
@@ -73,8 +73,8 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
  int64_t *keep_data = keep.data_ptr<int64_t>();
  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
-  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+  const int col_blocks =
+      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  unsigned long long *mask_data =

--- a/mmcv/ops/csrc/pytorch/cuda/assign_score_withk_cuda.cu
+++ b/mmcv/ops/csrc/pytorch/cuda/assign_score_withk_cuda.cu
@@ -13,7 +13,7 @@ void AssignScoreWithKForwardCUDAKernelLauncher(
  at::cuda::CUDAGuard device_guard(points.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  dim3 blocks(DIVUP(B * O * N1 * K, THREADS_PER_BLOCK));
+  dim3 blocks(GET_BLOCKS(B * O * N1 * K, THREADS_PER_BLOCK));
  dim3 threads(THREADS_PER_BLOCK);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -36,9 +36,9 @@ void AssignScoreWithKBackwardCUDAKernelLauncher(
  at::cuda::CUDAGuard device_guard(grad_out.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  dim3 blocks1(DIVUP(B * M * O, THREADS_PER_BLOCK));
+  dim3 blocks1(GET_BLOCKS(B * M * O, THREADS_PER_BLOCK));
  dim3 threads1(THREADS_PER_BLOCK);
-  dim3 blocks2(DIVUP(B * N1 * K * M, THREADS_PER_BLOCK));
+  dim3 blocks2(GET_BLOCKS(B * N1 * K * M, THREADS_PER_BLOCK));
  dim3 threads2(THREADS_PER_BLOCK);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(

--- a/mmcv/ops/csrc/pytorch/cuda/ball_query_cuda.cu
+++ b/mmcv/ops/csrc/pytorch/cuda/ball_query_cuda.cu
@@ -22,7 +22,7 @@ void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);
+  dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);
  dim3 threads(THREADS_PER_BLOCK);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(

--- a/mmcv/ops/csrc/pytorch/cuda/gather_points_cuda.cu
+++ b/mmcv/ops/csrc/pytorch/cuda/gather_points_cuda.cu
@@ -16,7 +16,7 @@ void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);
+  dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -43,7 +43,7 @@ void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);
+  dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);
  dim3 threads(THREADS_PER_BLOCK);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(