Commit ee5667c1 authored by zhangwenwei

Reformat cpp code to suppress warning

parent 2bb43004
@@ -274,7 +274,7 @@ class KittiDataset(torch_data.Dataset):
                              out)
        return result_files

    def evaluate(self, result_files, logger=None, eval_types=None):
        from mmdet3d.core.evaluation import kitti_eval
        gt_annos = [info['annos'] for info in self.kitti_infos]
        if eval_types == 'img_bbox':
@@ -283,7 +283,7 @@ class KittiDataset(torch_data.Dataset):
        else:
            ap_result_str, ap_dict = kitti_eval(gt_annos, result_files,
                                                self.class_names)
        return ap_dict

    def bbox2result_kitti(self, net_outputs, class_names, out=None):
        if out:
...
@@ -15,12 +15,32 @@ from .train_mixins import AnchorTrainMixin
@HEADS.register_module
class SECONDHead(nn.Module, AnchorTrainMixin):
    """Anchor-based head for VoxelNet detectors.

    Args:
        class_name (list[str]): name of classes (TODO: to be removed)
        in_channels (int): Number of channels in the input feature map.
        train_cfg (dict): train configs
        test_cfg (dict): test configs
        feat_channels (int): Number of channels of the feature map.
        use_direction_classifier (bool): Whether to add a direction classifier.
        encode_bg_as_zeros (bool): Whether to use sigmoid or softmax
            (TODO: to be removed)
        box_code_size (int): The size of box code.
        anchor_generator (dict): Config dict of anchor generator.
        assigner_per_size (bool): Whether to do assignment for each separate
            anchor size.
        assign_per_class (bool): Whether to do assignment for each class.
        diff_rad_by_sin (bool): Whether to change the difference into sin
            difference for box regression loss.
        dir_offset (float | int): The offset of BEV rotation angles
            (TODO: may be moved into box coder)
        dir_limit_offset (float | int): The limited range of BEV rotation angles
            (TODO: may be moved into box coder)
        box_coder (dict): Config dict of box coders.
        loss_cls (dict): Config of classification loss.
        loss_bbox (dict): Config of localization loss.
        loss_dir (dict): Config of direction classifier loss.
    """  # noqa: W605

    def __init__(self,
@@ -253,7 +273,7 @@ class SECONDHead(nn.Module, AnchorTrainMixin):
        num_levels = len(cls_scores)
        featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
        device = cls_scores[0].device
        mlvl_anchors = self.anchor_generator.grid_anchors(
            featmap_sizes, device=device)
        mlvl_anchors = [
            anchor.reshape(-1, self.box_code_size) for anchor in mlvl_anchors
...
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

#define CHECK_ERROR(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort) exit(code);
  }
}

const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

void boxesoverlapLauncher(const int num_a, const float *boxes_a,
                          const int num_b, const float *boxes_b,
                          float *ans_overlap);
void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b,
                         const float *boxes_b, float *ans_iou);
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
                 float nms_overlap_thresh);
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
                       int boxes_num, float nms_overlap_thresh);

int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                          at::Tensor ans_overlap) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
@@ -40,16 +51,18 @@ int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  const float *boxes_a_data = boxes_a.data_ptr<float>();
  const float *boxes_b_data = boxes_b.data_ptr<float>();
  float *ans_overlap_data = ans_overlap.data_ptr<float>();

  boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data,
                       ans_overlap_data);

  return 1;
}
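The DIVUP helper defined above is integer ceiling division; it is reused below to size the NMS bit mask and the CUDA grids. A minimal host-side sketch of what it computes (plain C++, illustration only):

#include <cassert>

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

int main() {
  // Ceiling division: how many blocks of size n are needed to cover m items.
  assert(DIVUP(64, 64) == 1);   // exact multiple -> no extra block
  assert(DIVUP(65, 64) == 2);   // one leftover item -> one extra block
  assert(DIVUP(1, 256) == 1);
  return 0;
}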
int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                      at::Tensor ans_iou) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
@@ -61,16 +74,16 @@ int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_iou
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  const float *boxes_a_data = boxes_a.data_ptr<float>();
  const float *boxes_b_data = boxes_b.data_ptr<float>();
  float *ans_iou_data = ans_iou.data_ptr<float>();

  boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data);

  return 1;
}
int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
@@ -78,21 +91,24 @@ int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  long *keep_data = keep.data_ptr<long>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);
@@ -102,25 +118,25 @@ int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}
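The keep loop above walks a suppression bit mask: each candidate box i owns col_blocks 64-bit words (THREADS_PER_BLOCK_NMS is 64 bits), and bit j in row i means box j overlaps box i above the threshold. A CPU-only sketch of the same bookkeeping on a hypothetical 3-box mask, just to illustrate the bit layout (not the project's API):

#include <cstdio>
#include <vector>

int main() {
  const int kBits = 64;            // bits per mask word, as in THREADS_PER_BLOCK_NMS
  int boxes_num = 3;
  int col_blocks = (boxes_num + kBits - 1) / kBits;  // DIVUP(boxes_num, 64) == 1

  // Hypothetical mask: box 0 suppresses box 1 (bit 1 set in row 0), nothing else.
  std::vector<unsigned long long> mask(boxes_num * col_blocks, 0ULL);
  mask[0 * col_blocks + 0] = (1ULL << 1);

  std::vector<unsigned long long> remv(col_blocks, 0ULL);
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / kBits, inblock = i % kBits;
    if (!(remv[nblock] & (1ULL << inblock))) {  // box i not yet suppressed
      printf("keep %d\n", i);                   // prints: keep 0, keep 2
      for (int j = nblock; j < col_blocks; j++)
        remv[j] |= mask[i * col_blocks + j];    // suppress everything box i overlaps
    }
  }
  return 0;
}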
int nms_normal_gpu(at::Tensor boxes, at::Tensor keep,
                   float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
@@ -128,21 +144,24 @@ int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  long *keep_data = keep.data_ptr<long>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);
@@ -152,27 +171,26 @@ int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu,
        "oriented boxes overlap");
  m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou");
  m.def("nms_gpu", &nms_gpu, "oriented nms gpu");
  m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu");
...
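For a sense of scale, the device mask that nms_gpu and nms_normal_gpu allocate above holds boxes_num * col_blocks 64-bit words. A tiny sketch of the arithmetic, with illustrative numbers only:

#include <cstdio>

int main() {
  const int kBits = 64;                      // THREADS_PER_BLOCK_NMS
  long long boxes_num = 10000;               // example box count, not from the repo
  long long col_blocks = (boxes_num + kBits - 1) / kBits;  // 157 words per box
  long long bytes = boxes_num * col_blocks * sizeof(unsigned long long);
  printf("%lld words -> %.1f MB\n", boxes_num * col_blocks, bytes / 1e6);  // ~12.6 MB
  return 0;
}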
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Points in boxes cpu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")

// #define DEBUG

inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
                                      float &local_x, float &local_y) {
  // should rotate pi/2 + alpha to translate LiDAR to local
  float rot_angle = rz + M_PI / 2;
  float cosa = cos(rot_angle), sina = sin(rot_angle);
@@ -24,10 +23,11 @@ inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz, fl
  local_y = shift_x * sina + shift_y * cosa;
}

inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
                                 float &local_x, float &local_y) {
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
  // bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
@@ -35,15 +35,16 @@ inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d, float &loc
  if (fabsf(z - cz) > h / 2.0) return 0;
  lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);
  float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
                  (local_y > -w / 2.0) & (local_y < w / 2.0);
  return in_flag;
}
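The check above shifts the point into the box frame, rotates it by rz + pi/2, and compares against the half extents. A standalone sketch of the same math with a made-up box and point; the local_x line is an assumed counterpart of the local_y line shown in the hunk (illustration, not code from the repo):

#include <cmath>
#include <cstdio>

// Mirrors lidar_to_local_coords_cpu above: rotate by rz + pi/2.
static void to_local(float sx, float sy, float rz, float &lx, float &ly) {
  const float kHalfPi = 1.57079632679f;
  float a = rz + kHalfPi, c = cosf(a), s = sinf(a);
  lx = sx * c - sy * s;  // assumed counterpart of the local_y line shown above
  ly = sx * s + sy * c;
}

int main() {
  // Hypothetical box: center (0, 0, 1), w = 2, l = 4, h = 2, yaw rz = 0.
  float cx = 0, cy = 0, cz = 1, w = 2, l = 4, h = 2, rz = 0;
  float pt[3] = {0.5f, 1.5f, 1.0f};  // hypothetical query point
  float lx, ly;
  if (fabsf(pt[2] - cz) > h / 2) {
    printf("outside (z)\n");
    return 0;
  }
  to_local(pt[0] - cx, pt[1] - cy, rz, lx, ly);
  int in = (lx > -l / 2) && (lx < l / 2) && (ly > -w / 2) && (ly < w / 2);
  printf("local = (%.2f, %.2f), inside = %d\n", lx, ly, in);  // (-1.50, 0.50), 1
  return 0;
}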
int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor pts_indices_tensor) {
  // params boxes: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is the
  // bottom center, each box DO NOT overlaps
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_indices: (N, npoints)
  CHECK_CONTIGUOUS(boxes_tensor);
  CHECK_CONTIGUOUS(pts_tensor);
@@ -52,14 +53,15 @@ int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tens
  int boxes_num = boxes_tensor.size(0);
  int pts_num = pts_tensor.size(0);

  const float *boxes = boxes_tensor.data_ptr<float>();
  const float *pts = pts_tensor.data_ptr<float>();
  int *pts_indices = pts_indices_tensor.data_ptr<int>();

  float local_x = 0, local_y = 0;
  for (int i = 0; i < boxes_num; i++) {
    for (int j = 0; j < pts_num; j++) {
      int cur_in_flag =
          check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);
      pts_indices[i * pts_num + j] = cur_in_flag;
    }
  }
...
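pts_indices is a flat (N, npoints) row-major buffer: row i holds one 0/1 flag per point for box i. A minimal indexing sketch with made-up sizes:

#include <cstdio>
#include <vector>

int main() {
  int boxes_num = 2, pts_num = 4;  // hypothetical sizes
  std::vector<int> pts_indices(boxes_num * pts_num, 0);
  // Mark point 3 as inside box 1, the same way the nested loop above writes it.
  int i = 1, j = 3;
  pts_indices[i * pts_num + j] = 1;
  printf("flag(box=1, pt=3) = %d\n", pts_indices[1 * pts_num + 3]);  // 1
  return 0;
}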
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Points in boxes gpu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)
// #define DEBUG

__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
                                             float rz, float &local_x,
                                             float &local_y) {
  // should rotate pi/2 + alpha to translate LiDAR to local
  float rot_angle = rz + M_PI / 2;
  float cosa = cos(rot_angle), sina = sin(rot_angle);
@@ -29,10 +32,11 @@ __device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float
  local_y = shift_x * sina + shift_y * cosa;
}

__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
                                        float &local_x, float &local_y) {
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
  // bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
@@ -40,16 +44,19 @@ __device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, flo
  if (fabsf(z - cz) > h / 2.0) return 0;
  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
  float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
                  (local_y > -w / 2.0) & (local_y < w / 2.0);
  return in_flag;
}

__global__ void points_in_boxes_kernel(int batch_size, int boxes_num,
                                       int pts_num, const float *boxes,
                                       const float *pts,
                                       int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1
  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -61,26 +68,28 @@ __global__ void points_in_boxes_kernel(int batch_size, int boxes_num, int pts_nu
  float local_x = 0, local_y = 0;
  int cur_in_flag = 0;
  for (int k = 0; k < boxes_num; k++) {
    cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
    if (cur_in_flag) {
      box_idx_of_points[0] = k;
      break;
    }
  }
}
void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num,
                              const float *boxes, const float *pts,
                              int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1
  cudaError_t err;

  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
  dim3 threads(THREADS_PER_BLOCK);
  points_in_boxes_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num,
                                              boxes, pts, box_idx_of_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
@@ -93,10 +102,12 @@ void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num, const
#endif
}

int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor box_idx_of_points_tensor) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1
  CHECK_INPUT(boxes_tensor);
  CHECK_INPUT(pts_tensor);
@@ -106,11 +117,12 @@ int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tens
  int boxes_num = boxes_tensor.size(1);
  int pts_num = pts_tensor.size(1);

  const float *boxes = boxes_tensor.data_ptr<float>();
  const float *pts = pts_tensor.data_ptr<float>();
  int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();

  points_in_boxes_launcher(batch_size, boxes_num, pts_num, boxes, pts,
                           box_idx_of_points);

  return 1;
}
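The launcher maps one thread to one point: gridDim.x covers the points of a sample in chunks of THREADS_PER_BLOCK and gridDim.y indexes the batch. A host-side sketch of the sizing arithmetic, with made-up sizes:

#include <cstdio>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

int main() {
  int batch_size = 2, pts_num = 20000;             // hypothetical sizes
  int grid_x = DIVUP(pts_num, THREADS_PER_BLOCK);  // 79 blocks along x
  // blocks = dim3(grid_x, batch_size), threads = dim3(256): thread threadIdx.x of
  // block (blockIdx.x, blockIdx.y) handles pt_idx = blockIdx.x * blockDim.x + threadIdx.x,
  // guarded in the kernel against pt_idx >= pts_num.
  printf("grid = (%d, %d), block = %d, covered points = %d\n",
         grid_x, batch_size, THREADS_PER_BLOCK, grid_x * THREADS_PER_BLOCK);
  return 0;
}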
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// RoI-aware point cloud feature pooling
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <assert.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
                              int max_pts_each_voxel, int out_x, int out_y,
                              int out_z, const float *rois, const float *pts,
                              const float *pts_feature, int *argmax,
                              int *pts_idx_of_voxels, float *pooled_features,
                              int pool_method);

void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
                                       int out_z, int channels,
                                       int max_pts_each_voxel,
                                       const int *pts_idx_of_voxels,
                                       const int *argmax, const float *grad_out,
                                       float *grad_in, int pool_method);

int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
                        at::Tensor argmax, at::Tensor pts_idx_of_voxels,
                        at::Tensor pooled_features, int pool_method);

int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
                                 at::Tensor argmax, at::Tensor grad_out,
                                 at::Tensor grad_in, int pool_method);

int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor pts_indices_tensor);

int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor box_idx_of_points_tensor);

int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
                        at::Tensor argmax, at::Tensor pts_idx_of_voxels,
                        at::Tensor pooled_features, int pool_method) {
  // params rois: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coordinate
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_feature: (npoints, C)
@@ -56,22 +69,27 @@ int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
  int out_x = pts_idx_of_voxels.size(1);
  int out_y = pts_idx_of_voxels.size(2);
  int out_z = pts_idx_of_voxels.size(3);
  assert((out_x < 256) && (out_y < 256) &&
         (out_z < 256));  // we encode index with 8bit

  const float *rois_data = rois.data_ptr<float>();
  const float *pts_data = pts.data_ptr<float>();
  const float *pts_feature_data = pts_feature.data_ptr<float>();
  int *argmax_data = argmax.data_ptr<int>();
  int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
  float *pooled_features_data = pooled_features.data_ptr<float>();

  roiaware_pool3d_launcher(
      boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
      rois_data, pts_data, pts_feature_data, argmax_data,
      pts_idx_of_voxels_data, pooled_features_data, pool_method);

  return 1;
}
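The assert above exists because, per its comment, the kernels encode a point's voxel index with 8 bits per axis, presumably packing (x, y, z) into a single int, so every output dimension must stay below 256. A small pack/unpack sketch of that kind of scheme, as an illustration of the constraint rather than the repo's exact bit layout:

#include <cassert>
#include <cstdio>

int main() {
  int out_x = 12, out_y = 7, out_z = 3;  // hypothetical voxel coordinate
  assert(out_x < 256 && out_y < 256 && out_z < 256);
  // Pack three 8-bit fields into one int.
  int packed = (out_x << 16) | (out_y << 8) | out_z;
  // Unpack them again.
  int x = (packed >> 16) & 0xff, y = (packed >> 8) & 0xff, z = packed & 0xff;
  printf("packed=0x%06x -> (%d, %d, %d)\n", packed, x, y, z);  // (12, 7, 3)
  return 0;
}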
int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
                                 at::Tensor argmax, at::Tensor grad_out,
                                 at::Tensor grad_in, int pool_method) {
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params argmax: (N, out_x, out_y, out_z, C)
  // params grad_out: (N, out_x, out_y, out_z, C)
@@ -90,20 +108,25 @@ int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels, at::Tensor argmax
  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
  int channels = grad_out.size(4);

  const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
  const int *argmax_data = argmax.data_ptr<int>();
  const float *grad_out_data = grad_out.data_ptr<float>();
  float *grad_in_data = grad_in.data_ptr<float>();

  roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels,
                                    max_pts_each_voxel, pts_idx_of_voxels_data,
                                    argmax_data, grad_out_data, grad_in_data,
                                    pool_method);

  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)");
  m.def("backward", &roiaware_pool3d_gpu_backward,
        "roiaware pool3d backward (CUDA)");
  m.def("points_in_boxes_gpu", &points_in_boxes_gpu,
        "points_in_boxes_gpu forward (CUDA)");
  m.def("points_in_boxes_cpu", &points_in_boxes_cpu,
        "points_in_boxes_cpu forward (CPU)");
}
@@ -26,9 +26,10 @@ namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.

template <typename T>
torch::Tensor fusedIndiceConvBatchNorm(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
@@ -37,13 +38,16 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data_ptr<int>(),
                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;

  /*if (_subM){
    std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
                                      indicePairNumCpu.data_ptr<int>() + kernelVolume);
    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);

    auto indicePairVecMaxSizeIter = std::max_element(
@@ -56,8 +60,10 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  // auto indicePairOptions =
  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());

  torch::Tensor output =
      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
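indicePairMaxOffset and indicePairMaxSize above come from running std::max_element directly over the int32 buffer behind indiceNum; the offset is plain pointer arithmetic. A tensor-free sketch of the same idiom, with made-up numbers:

#include <algorithm>
#include <cstdio>

int main() {
  // Stand-in for indicePairNumCpu.data_ptr<int>() with kernelVolume = 5.
  int indicePairNum[5] = {4, 9, 2, 9, 1};
  int kernelVolume = 5;
  const int *begin = indicePairNum;
  const int *maxIter = std::max_element(begin, begin + kernelVolume);
  int maxOffset = maxIter - begin;  // index of the busiest kernel offset -> 1
  int maxSize = *maxIter;           // its pair count -> 9
  printf("offset=%d size=%d\n", maxOffset, maxSize);
  return 0;
}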
@@ -69,33 +75,34 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    // auto timer = spconv::CudaContextTimer<>();
    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
                                            {nHot, numInPlanes}, options);

    if (device == torch::kCPU) {
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
    } else {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
      TV_CHECK_CUDA_ERR();
      /* slower than SparseGatherFunctor, may due to int->long conversion
      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
      auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
                                             {nHot}, indicePairOptions);
      torch::index_select_out(inputBufferBlob, features, 0, indicePairBlob);*/
    }
    // totalGatherTime += timer.report() / 1000.0;
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
@@ -105,14 +112,14 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
    } else {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
      TV_CHECK_CUDA_ERR();
    }
    // totalSAddTime += timer.report() / 1000.0;
...
@@ -34,7 +34,7 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
  torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
  double totalTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0) {
      continue;
    }
@@ -60,7 +60,8 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
template <typename T>
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
                                    torch::Tensor outFeatures,
                                    torch::Tensor outGrad,
                                    torch::Tensor indicePairs,
                                    torch::Tensor indiceNum) {
  auto device = features.device().type();
  auto numInPlanes = features.size(1);
@@ -70,7 +71,7 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  auto kernelVolume = indicePairs.size(0);
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0) {
      continue;
    }
...
@@ -25,8 +25,8 @@
namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
template <unsigned NDim>
std::vector<torch::Tensor> getIndicePair(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
@@ -67,8 +67,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
  auto indicePairUnique = torch::full(
      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
      torch::dtype(torch::kInt32).device(indices.device()));
  for (int i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
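The *32 copies exist because, as the comment above notes, torch.jit only exposes int64 arguments while the functors want 32-bit values, so each spatial vector is down-converted element by element. A minimal sketch of that conversion loop, using std::vector as a hypothetical stand-in for tv::SimpleVector:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const unsigned NDim = 3;
  std::vector<int64_t> outSpatialShape = {41, 1600, 1408};  // example shape only
  std::vector<int> outSpatialShape32;  // stand-in for tv::SimpleVector<int, NDim>
  for (unsigned i = 0; i < NDim; ++i) {
    // Same push_back pattern as the loop above, narrowing int64 -> int32.
    outSpatialShape32.push_back(static_cast<int>(outSpatialShape[i]));
  }
  printf("%d %d %d\n", outSpatialShape32[0], outSpatialShape32[1],
         outSpatialShape32[2]);
  return 0;
}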
@@ -88,16 +88,18 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
    }
    return {indices, indicePairs, indiceNum};
  } else {
@@ -105,19 +107,21 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
        torch::zeros({numAct * kernelVolume, coorDim + 1},
                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
          transpose);
    } else {
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
      auto getIndicePairFtorP2 =
          functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtorP1(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
@@ -137,8 +141,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
}

template <unsigned NDim>
std::vector<torch::Tensor> getIndicePairPreGrid(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
@@ -177,8 +181,8 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
  auto indicePairUnique = torch::full(
      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
      torch::dtype(torch::kInt32).device(indices.device()));
  for (int i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
@@ -198,17 +202,19 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
      gridOut.fill_(-1);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose, true);
    }
    return {indices, indicePairs, indiceNum};
  } else {
@@ -216,20 +222,22 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
        torch::zeros({numAct * kernelVolume, coorDim + 1},
                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
          transpose, true);
      gridOut.fill_(-1);
    } else {
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
      auto getIndicePairFtorP2 =
          functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtorP1(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
@@ -241,15 +249,14 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
          true);
      }
    }
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
  }
}
template <typename T> template <typename T>
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor indicePairs, torch::Tensor indiceNum,
...@@ -262,13 +269,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -262,13 +269,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
auto numInPlanes = features.size(1); auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1); auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU}); auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter = std::max_element( auto indicePairMaxSizeIter =
indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume); std::max_element(indicePairNumCpu.data_ptr<int>(),
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>(); indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter; int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){ /*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume); std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset); indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element( auto indicePairVecMaxSizeIter = std::max_element(
...@@ -282,7 +292,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -282,7 +292,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device()); // torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options); torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options); torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer = torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options); torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes}); filters = filters.view({-1, numInPlanes, numOutPlanes});
...@@ -294,33 +305,34 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -294,33 +305,34 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double totalGEMMTime = 0; double totalGEMMTime = 0;
double totalSAddTime = 0; double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i]; auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue; continue;
} }
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options); {nHot, numInPlanes}, options);
if (device == torch::kCPU) { if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else { } else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may be due to int->long conversion /* slower than SparseGatherFunctor, may be due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64); auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(), {nHot}, auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
indicePairOptions); {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
torch::index_select_out(inputBufferBlob, features, 0, features, 0, indicePairBlob);*/
indicePairBlob);*/
} }
// totalGatherTime += timer.report() / 1000.0; // totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]); torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
...@@ -330,14 +342,14 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -330,14 +342,14 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output), scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer), tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
true); nHot, true);
} else { } else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output), scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer), tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
true); nHot, true);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
// totalSAddTime += timer.report() / 1000.0; // totalSAddTime += timer.report() / 1000.0;
...@@ -349,9 +361,11 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -349,9 +361,11 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
} }
template <typename T> template <typename T>
std::vector<torch::Tensor> std::vector<torch::Tensor> indiceConvBackward(torch::Tensor features,
indiceConvBackward(torch::Tensor features, torch::Tensor filters, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t _inverse, int64_t _subM) { int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0; bool subM = _subM != 0;
bool inverse = _inverse != 0; bool inverse = _inverse != 0;
...@@ -362,16 +376,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -362,16 +376,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
auto numInPlanes = features.size(1); auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1); auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU}); auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter = std::max_element( auto indicePairMaxSizeIter =
indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume); std::max_element(indicePairNumCpu.data_ptr<int>(),
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>(); indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter; int indicePairMaxSize = *indicePairMaxSizeIter;
auto options = auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device()); torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes(); auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options); torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options); torch::Tensor filtersGrad = torch::zeros(filterShape, options);
torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options); torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer = torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options); torch::zeros({indicePairMaxSize, numOutPlanes}, options);
...@@ -383,7 +400,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -383,7 +400,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t()); torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
} }
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i]; auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue; continue;
} }
...@@ -392,27 +409,31 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -392,27 +409,31 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut; functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer), gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad), tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
} else { } else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer), gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad), tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
auto filterGradSub = filtersGrad[i]; auto filterGradSub = filtersGrad[i];
auto outputBufferBlob = auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options); {nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob); torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t()); torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
...@@ -420,12 +441,14 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -420,12 +441,14 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad), scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer), tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else { } else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad), scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer), tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
} }
...@@ -433,9 +456,12 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -433,9 +456,12 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
} }
template <typename T> template <typename T>
torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor filters, torch::Tensor indiceConvDevelopDontUse(torch::Tensor features,
torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor filters,
int64_t numActOut, int64_t _inverse, int64_t _subM) { torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM) {
bool subM = _subM != 0; bool subM = _subM != 0;
bool inverse = _inverse != 0; bool inverse = _inverse != 0;
...@@ -446,15 +472,19 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil ...@@ -446,15 +472,19 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
auto numOutPlanes = filters.size(ndim + 1); auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU}); auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto totalActsTen = indicePairNumCpu.sum(); auto totalActsTen = indicePairNumCpu.sum();
auto totalActs = indicePairNumCpu.data<int>()[0]; auto totalActs = indicePairNumCpu.data_ptr<int>()[0];
auto indicePairMaxSizeIter = std::max_element( auto indicePairMaxSizeIter =
indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume); std::max_element(indicePairNumCpu.data_ptr<int>(),
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>(); indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter; int indicePairMaxSize = *indicePairMaxSizeIter;
std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), std::vector<int> indicePairNumVec(
indicePairNumCpu.data<int>() + kernelVolume); indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset); indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
int subRuleMaxSize = *std::max_element(indicePairNumVec.begin(), indicePairNumVec.end()); int subRuleMaxSize =
*std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
if (subM) { if (subM) {
indicePairMaxSize = subRuleMaxSize; indicePairMaxSize = subRuleMaxSize;
} }
...@@ -470,7 +500,7 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil ...@@ -470,7 +500,7 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
torch::Tensor outputBuffer = torch::Tensor outputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options); torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes}); filters = filters.view({-1, numInPlanes, numOutPlanes});
std::cout << "create time " << timer.report()/1000.0 << std::endl; std::cout << "create time " << timer.report() / 1000.0 << std::endl;
if (subM) { // the center index of subm conv doesn't need gather and scatter if (subM) { // the center index of subm conv doesn't need gather and scatter
// add. // add.
torch::mm_out(output, features, filters[indicePairMaxOffset]); torch::mm_out(output, features, filters[indicePairMaxOffset]);
...@@ -480,43 +510,44 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil ...@@ -480,43 +510,44 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
double totalSAddTime = 0; double totalSAddTime = 0;
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i]; auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue; continue;
} }
// //
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(), auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(), auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options); {nHot, numInPlanes}, options);
if (device == torch::kCPU) { if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob), gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else { } else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob), gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
// } // }
// for (int i = 0; i < kernelVolume; ++i) { // for (int i = 0; i < kernelVolume; ++i) {
// totalGatherTime += timer.report() / 1000.0; // totalGatherTime += timer.report() / 1000.0;
// auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(), // auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
// {nHot, numOutPlanes}, options); // {nHot, numOutPlanes}, options);
} }
// totalGatherTime += timer.report() / 1000.0; // totalGatherTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i]; auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue; continue;
} }
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(), auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(), auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options); {nHot, numInPlanes}, options);
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]); torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
...@@ -524,27 +555,27 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil ...@@ -524,27 +555,27 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
// totalGEMMTime += timer.report() / 1000.0; // totalGEMMTime += timer.report() / 1000.0;
// totalGEMMTime += timer.report() / 1000.0; // totalGEMMTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data<int>()[i]; auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue; continue;
} }
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(), auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(), auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options); {nHot, numInPlanes}, options);
if (device == torch::kCPU) { if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output), scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob), tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
true); nHot, true);
} else { } else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output), scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob), tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
true); nHot, true);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
// totalSAddTime += timer.report() / 1000.0; // totalSAddTime += timer.report() / 1000.0;
......
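For reference, the indiceConv functions in the hunks above all follow a gather-GEMM-scatter pattern: for every kernel offset, the input features selected by the indice pairs are gathered into a dense buffer, multiplied with that offset's filter slice, and scatter-added into the output features. A minimal libtorch sketch of the same idea, using plain index_select/index_add_ instead of the SparseGather/SparseScatterAdd functors (the [kernelVolume, 2, maxPairs] layout of indicePairs and the pre-viewed filter shape are assumptions for the sketch, not guaranteed by this file):

#include <torch/torch.h>

// Gather-GEMM-scatter sketch of sparse convolution.
// features:    [numActIn, numInPlanes]
// filters:     [kernelVolume, numInPlanes, numOutPlanes] (already viewed)
// indicePairs: [kernelVolume, 2, maxPairs] (row 0: input idx, row 1: output idx)
// indiceNum:   [kernelVolume] number of valid pairs per kernel offset
torch::Tensor indice_conv_sketch(const torch::Tensor& features,
                                 const torch::Tensor& filters,
                                 const torch::Tensor& indicePairs,
                                 const torch::Tensor& indiceNum,
                                 int64_t numActOut) {
  auto numOutPlanes = filters.size(2);
  auto output = torch::zeros({numActOut, numOutPlanes}, features.options());
  auto pairNumCpu = indiceNum.to(torch::kCPU);
  for (int64_t i = 0; i < filters.size(0); ++i) {
    int64_t nHot = pairNumCpu[i].item<int64_t>();
    if (nHot <= 0) continue;
    auto inIdx = indicePairs[i][0].slice(0, 0, nHot).to(torch::kLong);
    auto outIdx = indicePairs[i][1].slice(0, 0, nHot).to(torch::kLong);
    auto gathered = features.index_select(0, inIdx);  // gather
    auto partial = torch::mm(gathered, filters[i]);   // GEMM
    output.index_add_(0, outIdx, partial);            // scatter-add
  }
  return output;
}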
...@@ -13,20 +13,21 @@ ...@@ -13,20 +13,21 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <tensorview/tensorview.h>
#include <torch/script.h>
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace tv { namespace tv {
struct TorchGPU: public tv::GPU { struct TorchGPU : public tv::GPU {
virtual cudaStream_t getStream() const override { virtual cudaStream_t getStream() const override {
return at::cuda::getCurrentCUDAStream(); return at::cuda::getCurrentCUDAStream();
} }
}; };
template <typename T> void check_torch_dtype(const torch::Tensor &tensor) { template <typename T>
void check_torch_dtype(const torch::Tensor &tensor) {
switch (tensor.type().scalarType()) { switch (tensor.type().scalarType()) {
case at::ScalarType::Double: { case at::ScalarType::Double: {
auto val = std::is_same<std::remove_const_t<T>, double>::value; auto val = std::is_same<std::remove_const_t<T>, double>::value;
...@@ -65,6 +66,6 @@ tv::TensorView<T> torch2tv(const torch::Tensor &tensor) { ...@@ -65,6 +66,6 @@ tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
for (auto i : tensor.sizes()) { for (auto i : tensor.sizes()) {
shape.push_back(i); shape.push_back(i);
} }
return tv::TensorView<T>(tensor.data<std::remove_const_t<T>>(), shape); return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape);
} }
} // namespace tv } // namespace tv
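The torch2tv helper above now reads the raw pointer through tensor.data_ptr<T>() instead of the deprecated tensor.data<T>(), which is the main source of the compiler warnings this commit suppresses. A small standalone illustration of the replacement (tensor contents and shape are made up for the example):

#include <torch/torch.h>
#include <iostream>

int main() {
  torch::Tensor t = torch::arange(6, torch::dtype(torch::kFloat32)).view({2, 3});
  // Deprecated: t.data<float>() emits a warning in recent PyTorch.
  // Preferred: data_ptr<T>() returns the same typed raw pointer.
  const float* p = t.data_ptr<float>();
  for (auto s : t.sizes()) std::cout << s << " ";    // shape: 2 3
  std::cout << "\nfirst element: " << p[0] << "\n";  // 0
  return 0;
}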
#include <torch/extension.h>
#include <ATen/TensorUtils.h> #include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h" // #include "voxelization.h"
namespace { namespace {
template <typename T_int> template <typename T_int>
void determin_max_points_kernel(torch::TensorAccessor<T_int,2> coor, void determin_max_points_kernel(
torch::TensorAccessor<T_int,1> point_to_voxelidx, torch::TensorAccessor<T_int, 2> coor,
torch::TensorAccessor<T_int,1> num_points_per_voxel, torch::TensorAccessor<T_int, 1> point_to_voxelidx,
torch::TensorAccessor<T_int,3> coor_to_voxelidx, torch::TensorAccessor<T_int, 1> num_points_per_voxel,
int& voxel_num, torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int& voxel_num,
int& max_points, int& max_points, const int num_points) {
const int num_points
) {
int voxelidx, num; int voxelidx, num;
for (int i = 0; i < num_points; ++i) { for (int i = 0; i < num_points; ++i) {
if (coor[i][0] == -1) if (coor[i][0] == -1) continue;
continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]]; voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel // record voxel
...@@ -35,25 +29,21 @@ void determin_max_points_kernel(torch::TensorAccessor<T_int,2> coor, ...@@ -35,25 +29,21 @@ void determin_max_points_kernel(torch::TensorAccessor<T_int,2> coor,
num_points_per_voxel[voxelidx] += 1; num_points_per_voxel[voxelidx] += 1;
// update max points per voxel // update max points per voxel
max_points = std::max(max_points, num+1); max_points = std::max(max_points, num + 1);
} }
return; return;
} }
template <typename T, typename T_int> template <typename T, typename T_int>
void scatter_point_to_voxel_kernel( void scatter_point_to_voxel_kernel(
const torch::TensorAccessor<T,2> points, const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T_int,2> coor, torch::TensorAccessor<T_int, 2> coor,
torch::TensorAccessor<T_int,1> point_to_voxelidx, torch::TensorAccessor<T_int, 1> point_to_voxelidx,
torch::TensorAccessor<T_int,3> coor_to_voxelidx, torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
torch::TensorAccessor<T,3> voxels, torch::TensorAccessor<T, 3> voxels,
torch::TensorAccessor<T_int,2> voxel_coors, torch::TensorAccessor<T_int, 2> voxel_coors, const int num_features,
const int num_features, const int num_points, const int NDim) {
const int num_points,
const int NDim
){
for (int i = 0; i < num_points; ++i) { for (int i = 0; i < num_points; ++i) {
int num = point_to_voxelidx[i]; int num = point_to_voxelidx[i];
int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]]; int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
...@@ -68,14 +58,11 @@ void scatter_point_to_voxel_kernel( ...@@ -68,14 +58,11 @@ void scatter_point_to_voxel_kernel(
} // namespace } // namespace
namespace voxelization { namespace voxelization {
std::vector<at::Tensor> dynamic_point_to_voxel_cpu( std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
const at::Tensor& points, const at::Tensor& points, const at::Tensor& voxel_mapping,
const at::Tensor& voxel_mapping, const std::vector<float> voxel_size, const std::vector<float> coors_range) {
const std::vector<float> voxel_size,
const std::vector<float> coors_range) {
// current version takes about 0.02s~0.03s for one frame on cpu // current version takes about 0.02s~0.03s for one frame on cpu
// check device // check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor"); AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
...@@ -86,46 +73,50 @@ std::vector<at::Tensor> dynamic_point_to_voxel_cpu( ...@@ -86,46 +73,50 @@ std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
std::vector<int> grid_size(NDim); std::vector<int> grid_size(NDim);
for (int i = 0; i < NDim; ++i) { for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]); grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
} }
at::Tensor num_points_per_voxel = at::zeros({num_points,}, voxel_mapping.options()); at::Tensor num_points_per_voxel = at::zeros(
at::Tensor coor_to_voxelidx = -at::ones({grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options()); {
at::Tensor point_to_voxelidx = -at::ones({num_points,}, voxel_mapping.options()); num_points,
},
voxel_mapping.options());
at::Tensor coor_to_voxelidx = -at::ones(
{grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
at::Tensor point_to_voxelidx = -at::ones(
{
num_points,
},
voxel_mapping.options());
int voxel_num = 0; int voxel_num = 0;
int max_points = 0; int max_points = 0;
AT_DISPATCH_ALL_TYPES(voxel_mapping.type(), "determin_max_point", [&] { AT_DISPATCH_ALL_TYPES(voxel_mapping.scalar_type(), "determin_max_point", [&] {
determin_max_points_kernel<scalar_t>( determin_max_points_kernel<scalar_t>(
voxel_mapping.accessor<scalar_t,2>(), voxel_mapping.accessor<scalar_t, 2>(),
point_to_voxelidx.accessor<scalar_t,1>(), point_to_voxelidx.accessor<scalar_t, 1>(),
num_points_per_voxel.accessor<scalar_t,1>(), num_points_per_voxel.accessor<scalar_t, 1>(),
coor_to_voxelidx.accessor<scalar_t,3>(), coor_to_voxelidx.accessor<scalar_t, 3>(), voxel_num, max_points,
voxel_num, num_points);
max_points,
num_points
);
}); });
at::Tensor voxels = at::zeros({voxel_num, max_points, num_features}, points.options()); at::Tensor voxels =
at::Tensor voxel_coors = at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt)); at::zeros({voxel_num, max_points, num_features}, points.options());
at::Tensor voxel_coors =
at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));
AT_DISPATCH_ALL_TYPES(points.type(), "scatter_point_to_voxel", [&] { AT_DISPATCH_ALL_TYPES(points.scalar_type(), "scatter_point_to_voxel", [&] {
scatter_point_to_voxel_kernel<scalar_t, int>( scatter_point_to_voxel_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(), points.accessor<scalar_t, 2>(), voxel_mapping.accessor<int, 2>(),
voxel_mapping.accessor<int,2>(), point_to_voxelidx.accessor<int, 1>(),
point_to_voxelidx.accessor<int,1>(), coor_to_voxelidx.accessor<int, 3>(), voxels.accessor<scalar_t, 3>(),
coor_to_voxelidx.accessor<int,3>(), voxel_coors.accessor<int, 2>(), num_features, num_points, NDim);
voxels.accessor<scalar_t,3>(),
voxel_coors.accessor<int,2>(),
num_features,
num_points,
NDim
);
}); });
at::Tensor num_points_per_voxel_out = num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num); at::Tensor num_points_per_voxel_out =
num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
return {voxels, voxel_coors, num_points_per_voxel_out}; return {voxels, voxel_coors, num_points_per_voxel_out};
} }
} } // namespace voxelization
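In dynamic_point_to_voxel_cpu above, the grid size per axis is round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]). For a typical KITTI-style setup (the numbers below are only illustrative, not values taken from this file), a point-cloud range of [0, -40, -3, 70.4, 40, 1] with a voxel size of [0.05, 0.05, 0.1] yields a 1408 x 1600 x 40 grid:

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int NDim = 3;
  // Illustrative values: [x_min, y_min, z_min, x_max, y_max, z_max].
  std::vector<float> coors_range = {0.f, -40.f, -3.f, 70.4f, 40.f, 1.f};
  std::vector<float> voxel_size = {0.05f, 0.05f, 0.1f};
  std::vector<int> grid_size(NDim);
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] = static_cast<int>(
        std::round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]));
  }
  std::printf("grid: %d x %d x %d\n", grid_size[0], grid_size[1], grid_size[2]);
  // Prints: grid: 1408 x 1600 x 40
  return 0;
}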
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <ATen/cuda/CUDAApplyUtils.cuh> #include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \ #define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \ #define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \ #define CHECK_INPUT(x) \
...@@ -177,7 +177,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu( ...@@ -177,7 +177,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
dim3 threads(threadsPerBlock); dim3 threads(threadsPerBlock);
cudaStream_t map_stream = at::cuda::getCurrentCUDAStream(); cudaStream_t map_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
voxel_mapping.type(), "determin_duplicate", ([&] { voxel_mapping.scalar_type(), "determin_duplicate", ([&] {
point_to_voxelidx_kernel<int><<<blocks, threads, 0, map_stream>>>( point_to_voxelidx_kernel<int><<<blocks, threads, 0, map_stream>>>(
voxel_mapping.data_ptr<int>(), point_to_voxelidx.data_ptr<int>(), voxel_mapping.data_ptr<int>(), point_to_voxelidx.data_ptr<int>(),
point_to_pointidx.data_ptr<int>(), num_points, NDim); point_to_pointidx.data_ptr<int>(), num_points, NDim);
...@@ -203,7 +203,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu( ...@@ -203,7 +203,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
voxel_mapping.options()); // must be zero from the beginning voxel_mapping.options()); // must be zero from the beginning
cudaStream_t logic_stream = at::cuda::getCurrentCUDAStream(); cudaStream_t logic_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
voxel_mapping.type(), "determin_duplicate", ([&] { voxel_mapping.scalar_type(), "determin_duplicate", ([&] {
determin_voxel_num<int><<<1, 1, 0, logic_stream>>>( determin_voxel_num<int><<<1, 1, 0, logic_stream>>>(
voxel_mapping.data_ptr<int>(), num_points_per_voxel.data_ptr<int>(), voxel_mapping.data_ptr<int>(), num_points_per_voxel.data_ptr<int>(),
point_to_voxelidx.data_ptr<int>(), point_to_voxelidx.data_ptr<int>(),
...@@ -228,7 +228,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu( ...@@ -228,7 +228,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
dim3 cp_threads(threadsPerBlock, 4); dim3 cp_threads(threadsPerBlock, 4);
cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream(); cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
points.type(), "scatter_point_to_voxel", ([&] { points.scalar_type(), "scatter_point_to_voxel", ([&] {
scatter_point_to_voxel_kernel<float, int> scatter_point_to_voxel_kernel<float, int>
<<<blocks, cp_threads, 0, cp_stream>>>( <<<blocks, cp_threads, 0, cp_stream>>>(
points.data_ptr<float>(), voxel_mapping.data_ptr<int>(), points.data_ptr<float>(), voxel_mapping.data_ptr<int>(),
...@@ -265,8 +265,8 @@ void dynamic_point_to_voxel_backward_gpu(at::Tensor& grad_input_points, ...@@ -265,8 +265,8 @@ void dynamic_point_to_voxel_backward_gpu(at::Tensor& grad_input_points,
dim3 blocks(col_blocks); dim3 blocks(col_blocks);
dim3 cp_threads(threadsPerBlock, 4); dim3 cp_threads(threadsPerBlock, 4);
cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream(); cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES(grad_input_points.type(), "scatter_point_to_voxel", AT_DISPATCH_ALL_TYPES(grad_input_points.scalar_type(),
([&] { "scatter_point_to_voxel", ([&] {
map_voxel_to_point_kernel<float, int> map_voxel_to_point_kernel<float, int>
<<<blocks, cp_threads, 0, cp_stream>>>( <<<blocks, cp_threads, 0, cp_stream>>>(
grad_input_points.data_ptr<float>(), grad_input_points.data_ptr<float>(),
......
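The CUDA hunks above also replace tensor.type() with tensor.scalar_type() inside the AT_DISPATCH_* macros, another deprecated-API warning this commit removes. A minimal, self-contained sketch of the dispatch pattern (the reduction below is a trivial stand-in, not one of the voxelization kernels):

#include <ATen/ATen.h>
#include <ATen/Dispatch.h>

// Sum all elements of a CPU tensor, dispatching on its runtime dtype.
double sum_all(const at::Tensor& t) {
  auto c = t.contiguous();
  double total = 0.0;
  AT_DISPATCH_ALL_TYPES(c.scalar_type(), "sum_all", [&] {
    // scalar_t is defined by the macro for the dispatched dtype.
    const scalar_t* data = c.data_ptr<scalar_t>();
    for (int64_t i = 0; i < c.numel(); ++i) {
      total += static_cast<double>(data[i]);
    }
  });
  return total;
}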
...@@ -49,7 +49,7 @@ inline int hard_voxelize(const at::Tensor& points, at::Tensor& voxels, ...@@ -49,7 +49,7 @@ inline int hard_voxelize(const at::Tensor& points, at::Tensor& voxels,
const std::vector<float> coors_range, const std::vector<float> coors_range,
const int max_points, const int max_voxels, const int max_points, const int max_voxels,
const int NDim = 3) { const int NDim = 3) {
if (points.type().is_cuda()) { if (points.device().is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel, return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels, voxel_size, coors_range, max_points, max_voxels,
...@@ -67,7 +67,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors, ...@@ -67,7 +67,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const std::vector<float> coors_range,
const int NDim = 3) { const int NDim = 3) {
if (points.type().is_cuda()) { if (points.device().is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim); return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim);
#else #else
...@@ -80,7 +80,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors, ...@@ -80,7 +80,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward( inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
const at::Tensor& points, const at::Tensor& voxel_mapping, const at::Tensor& points, const at::Tensor& voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range) { const std::vector<float> voxel_size, const std::vector<float> coors_range) {
if (points.type().is_cuda()) { if (points.device().is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
return dynamic_point_to_voxel_forward_gpu(points, voxel_mapping, voxel_size, return dynamic_point_to_voxel_forward_gpu(points, voxel_mapping, voxel_size,
coors_range); coors_range);
...@@ -95,7 +95,7 @@ inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward( ...@@ -95,7 +95,7 @@ inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
inline void dynamic_point_to_voxel_backward( inline void dynamic_point_to_voxel_backward(
at::Tensor& grad_input_points, const at::Tensor& grad_output_voxels, at::Tensor& grad_input_points, const at::Tensor& grad_output_voxels,
const at::Tensor& point_to_voxelidx, const at::Tensor& coor_to_voxelidx) { const at::Tensor& point_to_voxelidx, const at::Tensor& coor_to_voxelidx) {
if (grad_input_points.type().is_cuda()) { if (grad_input_points.device().is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
return dynamic_point_to_voxel_backward_gpu( return dynamic_point_to_voxel_backward_gpu(
grad_input_points, grad_output_voxels, point_to_voxelidx, grad_input_points, grad_output_voxels, point_to_voxelidx,
......
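The dispatch header above routes each call to the CPU or CUDA implementation based on the input tensor's device, using device().is_cuda() in place of the deprecated type().is_cuda(). Reduced to a hypothetical my_op with stand-in backends, the pattern looks like this:

#include <torch/extension.h>

// Stand-in backends; in the real header these are the *_cpu / *_gpu kernels.
at::Tensor my_op_cpu(const at::Tensor& x) { return x * 2; }
#ifdef WITH_CUDA
at::Tensor my_op_gpu(const at::Tensor& x) { return x * 2; }
#endif

inline at::Tensor my_op(const at::Tensor& x) {
  if (x.device().is_cuda()) {
#ifdef WITH_CUDA
    return my_op_gpu(x);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  return my_op_cpu(x);
}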
#include <torch/extension.h>
#include <ATen/TensorUtils.h> #include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h" // #include "voxelization.h"
namespace { namespace {
template <typename T, typename T_int> template <typename T, typename T_int>
void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points, void dynamic_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T_int, 2> coors, torch::TensorAccessor<T_int, 2> coors,
const std::vector<float> voxel_size, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const std::vector<float> coors_range,
const std::vector<int> grid_size, const std::vector<int> grid_size,
const int num_points, const int num_points, const int num_features,
const int num_features, const int NDim) {
const int NDim
) {
const int ndim_minus_1 = NDim - 1; const int ndim_minus_1 = NDim - 1;
bool failed = false; bool failed = false;
int coor[NDim]; int coor[NDim];
...@@ -44,56 +40,42 @@ void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points, ...@@ -44,56 +40,42 @@ void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points,
return; return;
} }
template <typename T, typename T_int> template <typename T, typename T_int>
void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points, void hard_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
torch::TensorAccessor<T,3> voxels, torch::TensorAccessor<T, 3> voxels,
torch::TensorAccessor<T_int,2> coors, torch::TensorAccessor<T_int, 2> coors,
torch::TensorAccessor<T_int,1> num_points_per_voxel, torch::TensorAccessor<T_int, 1> num_points_per_voxel,
torch::TensorAccessor<T_int,3> coor_to_voxelidx, torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
int& voxel_num, int& voxel_num, const std::vector<float> voxel_size,
const std::vector<float> voxel_size,
const std::vector<float> coors_range, const std::vector<float> coors_range,
const std::vector<int> grid_size, const std::vector<int> grid_size,
const int max_points, const int max_points, const int max_voxels,
const int max_voxels, const int num_points, const int num_features,
const int num_points, const int NDim) {
const int num_features,
const int NDim
) {
// declare a temp coors // declare a temp coors
at::Tensor temp_coors = at::zeros({num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU)); at::Tensor temp_coors = at::zeros(
{num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));
// First use dynamic voxelization to get coors, // First use dynamic voxelization to get coors,
// then check max points/voxels constraints // then check max points/voxels constraints
dynamic_voxelize_kernel<T, int>( dynamic_voxelize_kernel<T, int>(points, temp_coors.accessor<int, 2>(),
points, voxel_size, coors_range, grid_size,
temp_coors.accessor<int,2>(), num_points, num_features, NDim);
voxel_size,
coors_range,
grid_size,
num_points,
num_features,
NDim
);
int voxelidx, num; int voxelidx, num;
auto coor = temp_coors.accessor<int,2>(); auto coor = temp_coors.accessor<int, 2>();
for (int i = 0; i < num_points; ++i) { for (int i = 0; i < num_points; ++i) {
// T_int* coor = temp_coors.data_ptr<int>() + i * NDim; // T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
if (coor[i][0] == -1) if (coor[i][0] == -1) continue;
continue;
voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]]; voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
// record voxel // record voxel
if (voxelidx == -1) { if (voxelidx == -1) {
voxelidx = voxel_num; voxelidx = voxel_num;
if (max_voxels != -1 && voxel_num >= max_voxels) if (max_voxels != -1 && voxel_num >= max_voxels) break;
break;
voxel_num += 1; voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx; coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
...@@ -118,19 +100,14 @@ void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points, ...@@ -118,19 +100,14 @@ void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points,
} // namespace } // namespace
namespace voxelization { namespace voxelization {
int hard_voxelize_cpu( int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
const at::Tensor& points, at::Tensor& coors, at::Tensor& num_points_per_voxel,
at::Tensor& voxels,
at::Tensor& coors,
at::Tensor& num_points_per_voxel,
const std::vector<float> voxel_size, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const std::vector<float> coors_range,
const int max_points, const int max_points, const int max_voxels,
const int max_voxels, const int NDim = 3) {
const int NDim=3) {
// current version takes about 0.02s~0.03s for one frame on cpu // current version takes about 0.02s~0.03s for one frame on cpu
// check device // check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor"); AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
...@@ -140,43 +117,34 @@ int hard_voxelize_cpu( ...@@ -140,43 +117,34 @@ int hard_voxelize_cpu(
const int num_features = points.size(1); const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) { for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]); grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
} }
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
//printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2], grid_size[1], grid_size[0]); // printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2],
at::Tensor coor_to_voxelidx = -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options()); // grid_size[1], grid_size[0]);
at::Tensor coor_to_voxelidx =
-at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());
int voxel_num = 0; int voxel_num = 0;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] { AT_DISPATCH_FLOATING_TYPES_AND_HALF(
points.scalar_type(), "hard_voxelize_forward", [&] {
hard_voxelize_kernel<scalar_t, int>( hard_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(), points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),
voxels.accessor<scalar_t,3>(), coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),
coors.accessor<int,2>(), coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,
num_points_per_voxel.accessor<int,1>(), coors_range, grid_size, max_points, max_voxels, num_points,
coor_to_voxelidx.accessor<int,3>(), num_features, NDim);
voxel_num,
voxel_size,
coors_range,
grid_size,
max_points,
max_voxels,
num_points,
num_features,
NDim
);
}); });
return voxel_num; return voxel_num;
} }
void dynamic_voxelize_cpu(const at::Tensor& points, at::Tensor& coors,
void dynamic_voxelize_cpu(
const at::Tensor& points,
at::Tensor& coors,
const std::vector<float> voxel_size, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const std::vector<float> coors_range,
const int NDim=3) { const int NDim = 3) {
// check device // check device
AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor"); AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
...@@ -185,24 +153,19 @@ void dynamic_voxelize_cpu( ...@@ -185,24 +153,19 @@ void dynamic_voxelize_cpu(
const int num_features = points.size(1); const int num_features = points.size(1);
for (int i = 0; i < NDim; ++i) { for (int i = 0; i < NDim; ++i) {
grid_size[i] = round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]); grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
} }
// coors, num_points_per_voxel, coor_to_voxelidx are int Tensor // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] { AT_DISPATCH_FLOATING_TYPES_AND_HALF(
points.scalar_type(), "hard_voxelize_forward", [&] {
dynamic_voxelize_kernel<scalar_t, int>( dynamic_voxelize_kernel<scalar_t, int>(
points.accessor<scalar_t,2>(), points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(),
coors.accessor<int,2>(), voxel_size, coors_range, grid_size, num_points, num_features, NDim);
voxel_size,
coors_range,
grid_size,
num_points,
num_features,
NDim
);
}); });
return; return;
} }
} } // namespace voxelization
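The CPU voxelization kernels above index tensors through torch::TensorAccessor, which gives typed multi-dimensional indexing over a CPU tensor without manual stride arithmetic. A small standalone example (tensor contents are arbitrary):

#include <torch/torch.h>
#include <iostream>

int main() {
  // 4 points with 3 features (x, y, z) each.
  torch::Tensor points = torch::rand({4, 3});
  auto acc = points.accessor<float, 2>();  // CPU accessor: acc[i][j]
  float max_x = acc[0][0];
  for (int64_t i = 1; i < acc.size(0); ++i) {
    if (acc[i][0] > max_x) max_x = acc[i][0];
  }
  std::cout << "max x: " << max_x << "\n";
  return 0;
}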
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <ATen/cuda/CUDAApplyUtils.cuh> #include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \ #define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \ #define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \ #define CHECK_INPUT(x) \
...@@ -219,7 +219,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels, ...@@ -219,7 +219,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
// 1. link point to corresponding voxel coors // 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
points.type(), "hard_voxelize_kernel", ([&] { points.scalar_type(), "hard_voxelize_kernel", ([&] {
dynamic_voxelize_kernel<scalar_t, int> dynamic_voxelize_kernel<scalar_t, int>
<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>( <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
points.contiguous().data_ptr<scalar_t>(), points.contiguous().data_ptr<scalar_t>(),
...@@ -247,7 +247,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels, ...@@ -247,7 +247,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096)); dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 map_block(512); dim3 map_block(512);
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
temp_coors.type(), "determin_duplicate", ([&] { temp_coors.scalar_type(), "determin_duplicate", ([&] {
point_to_voxelidx_kernel<int> point_to_voxelidx_kernel<int>
<<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>( <<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>(
temp_coors.contiguous().data_ptr<int>(), temp_coors.contiguous().data_ptr<int>(),
...@@ -272,7 +272,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels, ...@@ -272,7 +272,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
points.options().dtype(at::kInt)); // must be zero from the beginning points.options().dtype(at::kInt)); // must be zero from the beginning
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
temp_coors.type(), "determin_duplicate", ([&] { temp_coors.scalar_type(), "determin_duplicate", ([&] {
determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>( determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
num_points_per_voxel.contiguous().data_ptr<int>(), num_points_per_voxel.contiguous().data_ptr<int>(),
point_to_voxelidx.contiguous().data_ptr<int>(), point_to_voxelidx.contiguous().data_ptr<int>(),
...@@ -290,7 +290,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels, ...@@ -290,7 +290,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096)); dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
dim3 cp_block(512); dim3 cp_block(512);
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
points.type(), "assign_point_to_voxel", ([&] { points.scalar_type(), "assign_point_to_voxel", ([&] {
assign_point_to_voxel<float, int> assign_point_to_voxel<float, int>
<<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>( <<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
pts_output_size, points.contiguous().data_ptr<float>(), pts_output_size, points.contiguous().data_ptr<float>(),
...@@ -308,7 +308,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels, ...@@ -308,7 +308,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096)); std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
dim3 coors_cp_block(512); dim3 coors_cp_block(512);
AT_DISPATCH_ALL_TYPES( AT_DISPATCH_ALL_TYPES(
points.type(), "assign_point_to_voxel", ([&] { points.scalar_type(), "assign_point_to_voxel", ([&] {
assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0, assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0,
at::cuda::getCurrentCUDAStream()>>>( at::cuda::getCurrentCUDAStream()>>>(
coors_output_size, temp_coors.contiguous().data_ptr<int>(), coors_output_size, temp_coors.contiguous().data_ptr<int>(),
......
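The CUDA hunks above size their launches with at::cuda::ATenCeilDiv and enqueue kernels on the current ATen stream. A stripped-down sketch of that launch pattern (the kernel body is a placeholder, not one of the voxelization kernels):

#include <algorithm>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>

// Placeholder kernel: element-wise copy of int data.
__global__ void copy_kernel(const int* src, int* dst, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) dst[idx] = src[idx];
}

void launch_copy(const at::Tensor& src, at::Tensor& dst) {
  TORCH_CHECK(src.is_contiguous() && dst.is_contiguous());
  const int n = src.numel();
  dim3 block(512);
  dim3 grid(std::min(at::cuda::ATenCeilDiv(n, 512), 4096));
  copy_kernel<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
      src.data_ptr<int>(), dst.data_ptr<int>(), n);
}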