Unverified Commit 2f88c124 authored by Wenhao Wu, committed by GitHub

[Enhance] Replace mmdet3d ops with mmcv ops (#1240)

* import some ops from mmcv instead of mmdet3d

* use mmcv ops in primitive_head.py

* use mmcv ops in PAConv

* remove ops in mmdet3d & fix some bugs

* remove spconv & fix some bugs

* fix voxelization unittest

* remove spconv in ops/__init__.py

* refine ops/__init__.py

* recover sparse_block in ops/__init__

* fix parta2_bbox_head unittest

* remove remaining ops

* recover ops/__init__.py for bc breaking

* add source of ops from mmcv

* recover the unittest for voxelization

* remove unittest
parent 41d77dad
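In short, ops that were vendored inside mmdet3d now come from mmcv.ops. A minimal sketch of the import migration for the SA-module ops touched below (that these names ship in a sufficiently recent mmcv release is an assumption):

# Before: ops vendored inside mmdet3d
# from mmdet3d.ops import GroupAll, Points_Sampler, QueryAndGroup, gather_points

# After: the same ops imported from mmcv
from mmcv.ops import GroupAll
from mmcv.ops import PointsSampler as Points_Sampler
from mmcv.ops import QueryAndGroup, gather_points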
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmcv.cnn import ConvModule
from mmcv.ops import GroupAll
from mmcv.ops import PointsSampler as Points_Sampler
from mmcv.ops import QueryAndGroup, gather_points
from torch import nn as nn
from torch.nn import functional as F

from mmdet3d.ops import PAConv

from .builder import SA_MODULES
......
# Copyright (c) OpenMMLab. All rights reserved.
from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
points_in_boxes_part)
from .roiaware_pool3d import RoIAwarePool3d
__all__ = [
'RoIAwarePool3d', 'points_in_boxes_part', 'points_in_boxes_cpu',
'points_in_boxes_all'
]
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from . import roiaware_pool3d_ext
def points_in_boxes_part(points, boxes):
"""Find the box in which each point is (CUDA).
Args:
points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
boxes (torch.Tensor): [B, T, 7],
num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz] in
LiDAR/DEPTH coordinate, (x, y, z) is the bottom center
Returns:
box_idxs_of_pts (torch.Tensor): (B, M), default background = -1
"""
assert points.shape[0] == boxes.shape[0], \
f'Points and boxes should have the same batch size, ' \
f'got {points.shape[0]} and {boxes.shape[0]}'
assert boxes.shape[2] == 7, \
f'boxes dimension should be 7, ' \
f'got unexpected shape {boxes.shape[2]}'
assert points.shape[2] == 3, \
f'points dimension should be 3, ' \
f'got unexpected shape {points.shape[2]}'
batch_size, num_points, _ = points.shape
box_idxs_of_pts = points.new_zeros((batch_size, num_points),
dtype=torch.int).fill_(-1)
# If manually put the tensor 'points' or 'boxes' on a device
# which is not the current device, some temporary variables
# will be created on the current device in the cuda op,
# and the output will be incorrect.
# Therefore, we force the current device to be the same
# as the device of the tensors if it was not.
# Please refer to https://github.com/open-mmlab/mmdetection3d/issues/305
# for the incorrect output before the fix.
points_device = points.get_device()
assert points_device == boxes.get_device(), \
'Points and boxes should be put on the same device'
if torch.cuda.current_device() != points_device:
torch.cuda.set_device(points_device)
roiaware_pool3d_ext.points_in_boxes_part(boxes.contiguous(),
points.contiguous(),
box_idxs_of_pts)
return box_idxs_of_pts
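A usage sketch for points_in_boxes_part with the shapes documented above; the tensors are placeholders and a CUDA device is required since the op is CUDA-only:

import torch

points = torch.rand(2, 128, 3).cuda()  # (B, M, 3), LiDAR/DEPTH coordinates
# (B, T, 7): x, y, z (bottom center), x_size, y_size, z_size, rz
boxes = torch.tensor([[[0.5, 0.5, 0.0, 1.0, 1.0, 1.0, 0.0]]]).repeat(2, 1, 1).cuda()
box_idxs = points_in_boxes_part(points, boxes)  # (B, M), -1 for background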
def points_in_boxes_cpu(points, boxes):
"""Find all boxes in which each point is (CPU). The CPU version of
:meth:`points_in_boxes_all`.
Args:
points (torch.Tensor): [B, M, 3], [x, y, z] in
LiDAR/DEPTH coordinate
boxes (torch.Tensor): [B, T, 7],
num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
(x, y, z) is the bottom center.
Returns:
box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0.
"""
assert points.shape[0] == boxes.shape[0], \
f'Points and boxes should have the same batch size, ' \
f'got {points.shape[0]} and {boxes.shape[0]}'
assert boxes.shape[2] == 7, \
f'boxes dimension should be 7, ' \
f'got unexpected shape {boxes.shape[2]}'
assert points.shape[2] == 3, \
f'points dimension should be 3, ' \
f'got unexpected shape {points.shape[2]}'
batch_size, num_points, _ = points.shape
num_boxes = boxes.shape[1]
point_indices = points.new_zeros((batch_size, num_boxes, num_points),
dtype=torch.int)
for b in range(batch_size):
roiaware_pool3d_ext.points_in_boxes_cpu(boxes[b].float().contiguous(),
points[b].float().contiguous(),
point_indices[b])
point_indices = point_indices.transpose(1, 2)
return point_indices
def points_in_boxes_all(points, boxes):
"""Find all boxes in which each point is (CUDA).
Args:
points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
boxes (torch.Tensor): [B, T, 7],
num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
(x, y, z) is the bottom center.
Returns:
box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0.
"""
assert boxes.shape[0] == points.shape[0], \
f'Points and boxes should have the same batch size, ' \
        f'got {boxes.shape[0]} and {points.shape[0]}'
assert boxes.shape[2] == 7, \
f'boxes dimension should be 7, ' \
f'got unexpected shape {boxes.shape[2]}'
assert points.shape[2] == 3, \
f'points dimension should be 3, ' \
f'got unexpected shape {points.shape[2]}'
batch_size, num_points, _ = points.shape
num_boxes = boxes.shape[1]
box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes),
dtype=torch.int).fill_(0)
    # Same device-handling reason as in points_in_boxes_part above
points_device = points.get_device()
assert points_device == boxes.get_device(), \
'Points and boxes should be put on the same device'
if torch.cuda.current_device() != points_device:
torch.cuda.set_device(points_device)
roiaware_pool3d_ext.points_in_boxes_all(boxes.contiguous(),
points.contiguous(),
box_idxs_of_pts)
return box_idxs_of_pts
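For reference, the membership test shared by these three wrappers (rotate into the box frame about z, with the box z given at the bottom center) can be written in broadcast PyTorch. This is an illustrative re-implementation, not part of the commit:

import torch

def points_in_boxes_reference(points, boxes):
    """Pure-PyTorch sketch of check_pt_in_box3d for (B, M, 3) points and
    (B, T, 7) boxes; returns a (B, M, T) 0/1 mask like points_in_boxes_all."""
    x, y, z = points.unbind(-1)                    # each (B, M)
    cx, cy, cz, dx, dy, dz, rz = boxes.unbind(-1)  # each (B, T)
    shift_x = x[:, :, None] - cx[:, None, :]
    shift_y = y[:, :, None] - cy[:, None, :]
    local_z = z[:, :, None] - (cz + dz / 2)[:, None, :]  # shift to box center
    cosa, sina = torch.cos(-rz)[:, None, :], torch.sin(-rz)[:, None, :]
    local_x = shift_x * cosa - shift_y * sina
    local_y = shift_x * sina + shift_y * cosa
    return ((local_x.abs() < dx[:, None, :] / 2) &
            (local_y.abs() < dy[:, None, :] / 2) &
            (local_z.abs() <= dz[:, None, :] / 2)).int()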
# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import torch
from torch import nn as nn
from torch.autograd import Function
from . import roiaware_pool3d_ext
class RoIAwarePool3d(nn.Module):
    def __init__(self, out_size, max_pts_per_voxel=128, mode='max'):
        """RoIAwarePool3d module.

        Args:
            out_size (int or tuple): n or [n1, n2, n3]
            max_pts_per_voxel (int): m
            mode (str): 'max' or 'avg'
        """
        super().__init__()
self.out_size = out_size
self.max_pts_per_voxel = max_pts_per_voxel
assert mode in ['max', 'avg']
pool_method_map = {'max': 0, 'avg': 1}
self.mode = pool_method_map[mode]
def forward(self, rois, pts, pts_feature):
"""RoIAwarePool3d module forward.
Args:
rois (torch.Tensor): [N, 7],in LiDAR coordinate,
(x, y, z) is the bottom center of rois
pts (torch.Tensor): [npoints, 3]
pts_feature (torch.Tensor): [npoints, C]
Returns:
pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C]
"""
return RoIAwarePool3dFunction.apply(rois, pts, pts_feature,
self.out_size,
self.max_pts_per_voxel, self.mode)
class RoIAwarePool3dFunction(Function):
@staticmethod
def forward(ctx, rois, pts, pts_feature, out_size, max_pts_per_voxel,
mode):
"""RoIAwarePool3d function forward.
Args:
rois (torch.Tensor): [N, 7], in LiDAR coordinate,
(x, y, z) is the bottom center of rois
pts (torch.Tensor): [npoints, 3]
pts_feature (torch.Tensor): [npoints, C]
out_size (int or tuple): n or [n1, n2, n3]
max_pts_per_voxel (int): m
mode (int): 0 (max pool) or 1 (average pool)
Returns:
pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C]
"""
if isinstance(out_size, int):
out_x = out_y = out_z = out_size
else:
assert len(out_size) == 3
assert mmcv.is_tuple_of(out_size, int)
out_x, out_y, out_z = out_size
num_rois = rois.shape[0]
num_channels = pts_feature.shape[-1]
num_pts = pts.shape[0]
pooled_features = pts_feature.new_zeros(
(num_rois, out_x, out_y, out_z, num_channels))
argmax = pts_feature.new_zeros(
(num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int)
pts_idx_of_voxels = pts_feature.new_zeros(
(num_rois, out_x, out_y, out_z, max_pts_per_voxel),
dtype=torch.int)
roiaware_pool3d_ext.forward(rois, pts, pts_feature, argmax,
pts_idx_of_voxels, pooled_features, mode)
ctx.roiaware_pool3d_for_backward = (pts_idx_of_voxels, argmax, mode,
num_pts, num_channels)
return pooled_features
@staticmethod
def backward(ctx, grad_out):
"""RoIAwarePool3d function forward.
Args:
grad_out (torch.Tensor): [N, out_x, out_y, out_z, C]
Returns:
grad_in (torch.Tensor): [npoints, C]
"""
ret = ctx.roiaware_pool3d_for_backward
pts_idx_of_voxels, argmax, mode, num_pts, num_channels = ret
grad_in = grad_out.new_zeros((num_pts, num_channels))
roiaware_pool3d_ext.backward(pts_idx_of_voxels, argmax,
grad_out.contiguous(), grad_in, mode)
return None, None, grad_in, None, None, None
if __name__ == '__main__':
pass
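A usage sketch for RoIAwarePool3d with the shapes from the docstrings above; the tensors are random placeholders and a CUDA device is assumed, since the extension is CUDA-only:

import torch

pool = RoIAwarePool3d(out_size=4, max_pts_per_voxel=128, mode='max')
rois = torch.tensor([[0., 0., 0., 2., 2., 2., 0.]]).cuda()  # (N, 7)
pts = (torch.rand(1024, 3).cuda() - 0.5) * 2                # (npoints, 3)
pts_feature = torch.rand(1024, 16).cuda()                   # (npoints, C)
pooled = pool(rois, pts, pts_feature)                       # (N, 4, 4, 4, 16)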
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
// #define DEBUG
inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
float &local_x, float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR
  // coordinate, cz is the bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > z_size / 2.0) return 0;
lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
(local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
return in_flag;
}
int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor pts_indices_tensor) {
  // params boxes: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center; boxes must not overlap
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_indices: (N, npoints)
CHECK_CONTIGUOUS(boxes_tensor);
CHECK_CONTIGUOUS(pts_tensor);
CHECK_CONTIGUOUS(pts_indices_tensor);
int boxes_num = boxes_tensor.size(0);
int pts_num = pts_tensor.size(0);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *pts_indices = pts_indices_tensor.data_ptr<int>();
float local_x = 0, local_y = 0;
for (int i = 0; i < boxes_num; i++) {
for (int j = 0; j < pts_num; j++) {
int cur_in_flag =
check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);
pts_indices[i * pts_num + j] = cur_in_flag;
}
}
return 1;
}
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/serialize/tensor.h>
#include <torch/types.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
// #define DEBUG
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
float rz, float &local_x,
float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR
  // coordinate, cz is the bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > z_size / 2.0) return 0;
lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
(local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
return in_flag;
}
__global__ void points_in_boxes_part_kernel(int batch_size, int boxes_num,
int pts_num, const float *boxes,
const float *pts,
int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center; boxes must not overlap
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= batch_size || pt_idx >= pts_num) return;
boxes += bs_idx * boxes_num * 7;
pts += bs_idx * pts_num * 3 + pt_idx * 3;
box_idx_of_points += bs_idx * pts_num + pt_idx;
float local_x = 0, local_y = 0;
int cur_in_flag = 0;
for (int k = 0; k < boxes_num; k++) {
cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
if (cur_in_flag) {
box_idx_of_points[0] = k;
break;
}
}
}
__global__ void points_in_boxes_all_kernel(int batch_size, int boxes_num,
int pts_num, const float *boxes,
const float *pts,
int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center; boxes must not overlap
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= batch_size || pt_idx >= pts_num) return;
boxes += bs_idx * boxes_num * 7;
pts += bs_idx * pts_num * 3 + pt_idx * 3;
box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;
float local_x = 0, local_y = 0;
int cur_in_flag = 0;
for (int k = 0; k < boxes_num; k++) {
cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
if (cur_in_flag) {
box_idx_of_points[k] = 1;
}
cur_in_flag = 0;
}
}
void points_in_boxes_part_launcher(int batch_size, int boxes_num, int pts_num,
const float *boxes, const float *pts,
int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center; boxes must not overlap
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1
cudaError_t err;
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK);
points_in_boxes_part_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num,
boxes, pts, box_idx_of_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
void points_in_boxes_all_launcher(int batch_size, int boxes_num, int pts_num,
const float *boxes, const float *pts,
int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1
cudaError_t err;
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK);
points_in_boxes_all_kernel<<<blocks, threads>>>(
batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
int points_in_boxes_part(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center; boxes must not overlap
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1
CHECK_INPUT(boxes_tensor);
CHECK_INPUT(pts_tensor);
CHECK_INPUT(box_idx_of_points_tensor);
int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
points_in_boxes_part_launcher(batch_size, boxes_num, pts_num, boxes, pts,
box_idx_of_points);
return 1;
}
int points_in_boxes_all(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1
CHECK_INPUT(boxes_tensor);
CHECK_INPUT(pts_tensor);
CHECK_INPUT(box_idx_of_points_tensor);
int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
points_in_boxes_all_launcher(batch_size, boxes_num, pts_num, boxes, pts,
box_idx_of_points);
return 1;
}
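Both launchers in this file use the same geometry: one thread per point, one grid row per batch element. The sizing rule, sketched in Python for illustration:

THREADS_PER_BLOCK = 256

def divup(m, n):
    # integer ceil(m / n), matching the DIVUP macro above
    return m // n + (m % n > 0)

# points_in_boxes_*_kernel launch:
#   grid  = (divup(pts_num, THREADS_PER_BLOCK), batch_size)
#   block = (THREADS_PER_BLOCK,)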
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *rois, const float *pts,
const float *pts_feature, int *argmax,
int *pts_idx_of_voxels, float *pooled_features,
int pool_method);
void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
int out_z, int channels,
int max_pts_each_voxel,
const int *pts_idx_of_voxels,
const int *argmax, const float *grad_out,
float *grad_in, int pool_method);
int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
at::Tensor argmax, at::Tensor pts_idx_of_voxels,
at::Tensor pooled_features, int pool_method);
int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
at::Tensor argmax, at::Tensor grad_out,
at::Tensor grad_in, int pool_method);
int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor pts_indices_tensor);
int points_in_boxes_part(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor);
int points_in_boxes_all(at::Tensor boxes_tensor, at::Tensor pts_tensor,
at::Tensor box_idx_of_points_tensor);
int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
at::Tensor argmax, at::Tensor pts_idx_of_voxels,
at::Tensor pooled_features, int pool_method) {
// params rois: (N, 7) [x, y, z, x_size, y_size, z_size, ry] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
// params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool
CHECK_INPUT(rois);
CHECK_INPUT(pts);
CHECK_INPUT(pts_feature);
CHECK_INPUT(argmax);
CHECK_INPUT(pts_idx_of_voxels);
CHECK_INPUT(pooled_features);
int boxes_num = rois.size(0);
int pts_num = pts.size(0);
int channels = pts_feature.size(1);
int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
assert((out_x < 256) && (out_y < 256) &&
(out_z < 256)); // we encode index with 8bit
const float *rois_data = rois.data_ptr<float>();
const float *pts_data = pts.data_ptr<float>();
const float *pts_feature_data = pts_feature.data_ptr<float>();
int *argmax_data = argmax.data_ptr<int>();
int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
float *pooled_features_data = pooled_features.data_ptr<float>();
roiaware_pool3d_launcher(
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
rois_data, pts_data, pts_feature_data, argmax_data,
pts_idx_of_voxels_data, pooled_features_data, pool_method);
return 1;
}
int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
at::Tensor argmax, at::Tensor grad_out,
at::Tensor grad_in, int pool_method) {
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
// params pool_method: 0: max_pool 1: avg_pool
CHECK_INPUT(pts_idx_of_voxels);
CHECK_INPUT(argmax);
CHECK_INPUT(grad_out);
CHECK_INPUT(grad_in);
int boxes_num = pts_idx_of_voxels.size(0);
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter
int channels = grad_out.size(4);
const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
const int *argmax_data = argmax.data_ptr<int>();
const float *grad_out_data = grad_out.data_ptr<float>();
float *grad_in_data = grad_in.data_ptr<float>();
roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels,
max_pts_each_voxel, pts_idx_of_voxels_data,
argmax_data, grad_out_data, grad_in_data,
pool_method);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)");
m.def("backward", &roiaware_pool3d_gpu_backward,
"roiaware pool3d backward (CUDA)");
m.def("points_in_boxes_part", &points_in_boxes_part,
"points_in_boxes_part forward (CUDA)");
m.def("points_in_boxes_all", &points_in_boxes_all,
"points_in_boxes_all forward (CUDA)");
m.def("points_in_boxes_cpu", &points_in_boxes_cpu,
"points_in_boxes_cpu forward (CPU)");
}
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/serialize/tensor.h>
#include <torch/types.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// #define DEBUG
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
float rz, float &local_x,
float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR
  // coordinate, cz is the bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > z_size / 2.0) return 0;
lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
(local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
return in_flag;
}
__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
int out_x, int out_y, int out_z,
const float *rois, const float *pts,
int *pts_mask) {
  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate
  // params pts: (npoints, 3) [x, y, z]
  // params pts_mask: (N, npoints): -1 means the point is not in this box;
  // otherwise the (x_idx, y_idx, z_idx) voxel index packed into 8-bit fields
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y;
if (pt_idx >= pts_num || box_idx >= boxes_num) return;
pts += pt_idx * 3;
rois += box_idx * 7;
pts_mask += box_idx * pts_num + pt_idx;
float local_x = 0, local_y = 0;
int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
pts_mask[0] = -1;
if (cur_in_flag > 0) {
float local_z = pts[2] - rois[2];
float x_size = rois[3], y_size = rois[4], z_size = rois[5];
float x_res = x_size / out_x;
float y_res = y_size / out_y;
float z_res = z_size / out_z;
unsigned int x_idx = int((local_x + x_size / 2) / x_res);
unsigned int y_idx = int((local_y + y_size / 2) / y_res);
unsigned int z_idx = int(local_z / z_res);
x_idx = min(max(x_idx, 0), out_x - 1);
y_idx = min(max(y_idx, 0), out_y - 1);
z_idx = min(max(z_idx, 0), out_z - 1);
unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
#ifdef DEBUG
printf(
"mask: pts_%d(%.3f, %.3f, %.3f), local(%.3f, %.3f, %.3f), idx(%d, %d, "
"%d), res(%.3f, %.3f, %.3f), idx_encoding=%x\n",
pt_idx, pts[0], pts[1], pts[2], local_x, local_y, local_z, x_idx, y_idx,
z_idx, x_res, y_res, z_res, idx_encoding);
#endif
pts_mask[0] = idx_encoding;
}
}
__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,
int max_pts_each_voxel, int out_x,
int out_y, int out_z,
const int *pts_mask,
int *pts_idx_of_voxels) {
  // params pts_mask: (N, npoints): -1 or a packed voxel index
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
int box_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (box_idx >= boxes_num) return;
int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
for (int k = 0; k < pts_num; k++) {
if (pts_mask[box_idx * pts_num + k] != -1) {
unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
unsigned int z_idx = idx_encoding & 0xFF;
unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
y_idx * out_z * max_pts_each_voxel +
z_idx * max_pts_each_voxel;
unsigned int cnt = pts_idx_of_voxels[base_offset];
if (cnt < max_num_pts) {
pts_idx_of_voxels[base_offset + cnt + 1] = k;
pts_idx_of_voxels[base_offset]++;
}
#ifdef DEBUG
printf("collect: pts_%d, idx(%d, %d, %d), idx_encoding=%x\n", k, x_idx,
y_idx, z_idx, idx_encoding);
#endif
}
}
}
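generate_pts_mask_for_box3d packs the voxel index of every inside-point into a single int, 8 bits per axis (hence the out_x/out_y/out_z < 256 assertion in roiaware_pool3d_gpu), and collect_inside_pts_for_box3d unpacks it. The packing in Python, for illustration:

def encode_voxel_idx(x_idx, y_idx, z_idx):
    # mirrors: idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx
    return (x_idx << 16) + (y_idx << 8) + z_idx

def decode_voxel_idx(idx_encoding):
    # mirrors the unpacking in collect_inside_pts_for_box3d
    return ((idx_encoding >> 16) & 0xFF,
            (idx_encoding >> 8) & 0xFF,
            idx_encoding & 0xFF)

assert decode_voxel_idx(encode_voxel_idx(3, 2, 1)) == (3, 2, 1)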
__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *pts_feature,
const int *pts_idx_of_voxels,
float *pooled_features, int *argmax) {
// params pts_feature: (npoints, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
  // index 0 is the counter
  // params pooled_features: (N, out_x, out_y, out_z, C)
// params argmax: (N, out_x, out_y, out_z, C)
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
#ifdef DEBUG
printf("src pts_idx_of_voxels: (%p, ), argmax: %p\n", pts_idx_of_voxels,
argmax);
#endif
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
pooled_features += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
argmax += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
int argmax_idx = -1;
float max_val = -1e50;
int total_pts = pts_idx_of_voxels[0];
for (int k = 1; k <= total_pts; k++) {
if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) {
max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
argmax_idx = pts_idx_of_voxels[k];
}
}
if (argmax_idx != -1) {
pooled_features[0] = max_val;
}
argmax[0] = argmax_idx;
#ifdef DEBUG
printf(
"channel_%d idx(%d, %d, %d), argmax_idx=(%d, %.3f), total=%d, after "
"pts_idx: %p, argmax: (%p, %d)\n",
channel_idx, x_idx, y_idx, z_idx, argmax_idx, max_val, total_pts,
pts_idx_of_voxels, argmax, argmax_idx);
#endif
}
__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *pts_feature,
const int *pts_idx_of_voxels,
float *pooled_features) {
// params pts_feature: (npoints, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
  // index 0 is the counter
  // params pooled_features: (N, out_x, out_y, out_z, C)
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
pooled_features += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
float sum_val = 0;
int total_pts = pts_idx_of_voxels[0];
for (int k = 1; k <= total_pts; k++) {
sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
}
if (total_pts > 0) {
pooled_features[0] = sum_val / total_pts;
}
}
void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *rois, const float *pts,
const float *pts_feature, int *argmax,
int *pts_idx_of_voxels, float *pooled_features,
int pool_method) {
// params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
// params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool
int *pts_mask = NULL;
cudaMalloc(&pts_mask, boxes_num * pts_num * sizeof(int)); // (N, M)
cudaMemset(pts_mask, -1, boxes_num * pts_num * sizeof(int));
dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num);
dim3 threads(THREADS_PER_BLOCK);
generate_pts_mask_for_box3d<<<blocks_mask, threads>>>(
boxes_num, pts_num, out_x, out_y, out_z, rois, pts, pts_mask);
// TODO: Merge the collect and pool functions, SS
dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
collect_inside_pts_for_box3d<<<blocks_collect, threads>>>(
boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, out_z, pts_mask,
pts_idx_of_voxels);
dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
boxes_num);
if (pool_method == 0) {
roiaware_maxpool3d<<<blocks_pool, threads>>>(
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
pts_feature, pts_idx_of_voxels, pooled_features, argmax);
} else if (pool_method == 1) {
roiaware_avgpool3d<<<blocks_pool, threads>>>(
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
pts_feature, pts_idx_of_voxels, pooled_features);
}
cudaFree(pts_mask);
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
int out_x, int out_y, int out_z,
const int *argmax,
const float *grad_out,
float *grad_in) {
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
argmax += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
grad_out += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
if (argmax[0] == -1) return;
atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
}
__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
int out_x, int out_y, int out_z,
int max_pts_each_voxel,
const int *pts_idx_of_voxels,
const float *grad_out,
float *grad_in) {
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
grad_out += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
int total_pts = pts_idx_of_voxels[0];
float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
for (int k = 1; k <= total_pts; k++) {
atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
grad_out[0] * cur_grad);
}
}
void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
int out_z, int channels,
int max_pts_each_voxel,
const int *pts_idx_of_voxels,
const int *argmax, const float *grad_out,
float *grad_in, int pool_method) {
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
// params pool_method: 0: max_pool, 1: avg_pool
dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
boxes_num);
dim3 threads(THREADS_PER_BLOCK);
if (pool_method == 0) {
roiaware_maxpool3d_backward<<<blocks, threads>>>(
boxes_num, channels, out_x, out_y, out_z, argmax, grad_out, grad_in);
} else if (pool_method == 1) {
roiaware_avgpool3d_backward<<<blocks, threads>>>(
boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel,
pts_idx_of_voxels, grad_out, grad_in);
}
}
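roiaware_avgpool3d_backward spreads each voxel's grad_out evenly over its total_pts contributing points, i.e. the gradient of a mean. A quick autograd sanity check of that rule (illustration only):

import torch

feats = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
pooled = feats.mean()  # what roiaware_avgpool3d computes per voxel/channel
pooled.backward()
assert torch.allclose(feats.grad, torch.full((3,), 1.0 / 3))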
# Copyright (c) OpenMMLab. All rights reserved.
from .roipoint_pool3d import RoIPointPool3d
__all__ = ['RoIPointPool3d']
# Copyright (c) OpenMMLab. All rights reserved.
from torch import nn as nn
from torch.autograd import Function
from . import roipoint_pool3d_ext
class RoIPointPool3d(nn.Module):
    def __init__(self, num_sampled_points=512):
        """
        Args:
            num_sampled_points (int): Number of samples in each roi
        """
        super().__init__()
        self.num_sampled_points = num_sampled_points
def forward(self, points, point_features, boxes3d):
"""
Args:
points (torch.Tensor): Input points whose shape is BxNx3
point_features: (B, N, C)
boxes3d: (B, M, 7), [x, y, z, dx, dy, dz, heading]
Returns:
torch.Tensor: (B, M, 512, 3 + C) pooled_features
torch.Tensor: (B, M) pooled_empty_flag
"""
return RoIPointPool3dFunction.apply(points, point_features, boxes3d,
self.num_sampled_points)
class RoIPointPool3dFunction(Function):
@staticmethod
def forward(ctx, points, point_features, boxes3d, num_sampled_points=512):
"""
Args:
points (torch.Tensor): Input points whose shape is (B, N, 3)
point_features (torch.Tensor): Input points features shape is \
(B, N, C)
boxes3d (torch.Tensor): Input bounding boxes whose shape is \
(B, M, 7)
num_sampled_points (int): the num of sampled points
Returns:
torch.Tensor: (B, M, 512, 3 + C) pooled_features
torch.Tensor: (B, M) pooled_empty_flag
"""
        assert len(points.shape) == 3 and points.shape[2] == 3
        batch_size = points.shape[0]
        boxes_num = boxes3d.shape[1]
        feature_len = point_features.shape[2]
pooled_boxes3d = boxes3d.view(batch_size, -1, 7)
pooled_features = point_features.new_zeros(
(batch_size, boxes_num, num_sampled_points, 3 + feature_len))
pooled_empty_flag = point_features.new_zeros(
(batch_size, boxes_num)).int()
roipoint_pool3d_ext.forward(points.contiguous(),
pooled_boxes3d.contiguous(),
point_features.contiguous(),
pooled_features, pooled_empty_flag)
return pooled_features, pooled_empty_flag
@staticmethod
def backward(ctx, grad_out):
raise NotImplementedError
if __name__ == '__main__':
pass
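A usage sketch for RoIPointPool3d with the shapes from the docstrings above; the tensors are random placeholders and a CUDA device is assumed, since the op is CUDA-only:

import torch

pool = RoIPointPool3d(num_sampled_points=512)
points = torch.rand(2, 1024, 3).cuda()        # (B, N, 3)
feats = torch.rand(2, 1024, 16).cuda()        # (B, N, C)
boxes3d = torch.rand(2, 4, 7).cuda()          # (B, M, 7)
pooled, empty = pool(points, feats, boxes3d)  # (B, M, 512, 3 + C), (B, M)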
/*
Modified from
https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#define CHECK_CUDA(x) do { \
if (!x.type().is_cuda()) { \
fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_CONTIGUOUS(x) do { \
if (!x.is_contiguous()) { \
fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
const float *xyz, const float *boxes3d, const float *pts_feature, float *pooled_features, int *pooled_empty_flag);
int roipool3d_gpu(at::Tensor xyz, at::Tensor boxes3d, at::Tensor pts_feature, at::Tensor pooled_features, at::Tensor pooled_empty_flag){
// params xyz: (B, N, 3)
// params boxes3d: (B, M, 7)
// params pts_feature: (B, N, C)
// params pooled_features: (B, M, 512, 3+C)
// params pooled_empty_flag: (B, M)
CHECK_INPUT(xyz);
CHECK_INPUT(boxes3d);
CHECK_INPUT(pts_feature);
CHECK_INPUT(pooled_features);
CHECK_INPUT(pooled_empty_flag);
int batch_size = xyz.size(0);
int pts_num = xyz.size(1);
int boxes_num = boxes3d.size(1);
int feature_in_len = pts_feature.size(2);
int sampled_pts_num = pooled_features.size(2);
const float * xyz_data = xyz.data<float>();
const float * boxes3d_data = boxes3d.data<float>();
const float * pts_feature_data = pts_feature.data<float>();
float * pooled_features_data = pooled_features.data<float>();
int * pooled_empty_flag_data = pooled_empty_flag.data<int>();
roipool3dLauncher(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
xyz_data, boxes3d_data, pts_feature_data, pooled_features_data, pooled_empty_flag_data);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &roipool3d_gpu, "roipool3d forward (CUDA)");
}
/*
Modified from
https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <math.h>
#include <stdio.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
// #define DEBUG
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
float rz, float &local_x,
float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
  // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz is the
  // bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6];
cz += dz / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > dz / 2.0) return 0;
lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -dx / 2.0) & (local_x < dx / 2.0) &
(local_y > -dy / 2.0) & (local_y < dy / 2.0);
return in_flag;
}
__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, const float *xyz, const float *boxes3d, int *pts_assign){
// params xyz: (B, N, 3)
// params boxes3d: (B, M, 7)
  // params pts_assign: (B, N, M): 1 if the point falls inside the box, else 0
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y;
int bs_idx = blockIdx.z;
if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size){
return;
}
int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
pts_assign[assign_idx] = 0;
int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
float local_x = 0, local_y = 0;
int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, local_x, local_y);
pts_assign[assign_idx] = cur_in_flag;
// printf("bs=%d, pt=%d, in=%d\n", bs_idx, pt_idx, pts_assign[bs_idx * pts_num + pt_idx]);
}
__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, int sampled_pts_num,
const int *pts_assign, int *pts_idx, int *pooled_empty_flag){
  // params pts_assign: (B, N, M)
  // params pts_idx: (B, M, 512)
  // params pooled_empty_flag: (B, M)
int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (boxes_idx >= boxes_num){
return;
}
int bs_idx = blockIdx.y;
int cnt = 0;
for (int k = 0; k < pts_num; k++){
if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]){
if (cnt < sampled_pts_num){
pts_idx[bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num + cnt] = k;
cnt++;
}
else break;
}
}
if (cnt == 0){
pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
}
else if (cnt < sampled_pts_num){
// duplicate same points for sampling
for (int k = cnt; k < sampled_pts_num; k++){
int duplicate_idx = k % cnt;
int base_offset = bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
}
}
}
__global__ void roipool3d_forward(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
const float *xyz, const int *pts_idx, const float *pts_feature,
float *pooled_features, int *pooled_empty_flag){
// params xyz: (B, N, 3)
// params pts_idx: (B, M, 512)
// params pts_feature: (B, N, C)
// params pooled_features: (B, M, 512, 3+C)
// params pooled_empty_flag: (B, M)
int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y;
int bs_idx = blockIdx.z;
if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num || bs_idx >= batch_size){
return;
}
if (pooled_empty_flag[bs_idx * boxes_num + box_idx]){
return;
}
int temp_idx = bs_idx * boxes_num * sampled_pts_num + box_idx * sampled_pts_num + sample_pt_idx;
int src_pt_idx = pts_idx[temp_idx];
int dst_feature_offset = temp_idx * (3 + feature_in_len);
for (int j = 0; j < 3; j++)
pooled_features[dst_feature_offset + j] = xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
int src_feature_offset = bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
for (int j = 0; j < feature_in_len; j++)
pooled_features[dst_feature_offset + 3 + j] = pts_feature[src_feature_offset + j];
}
void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
const float *xyz, const float *boxes3d, const float *pts_feature, float *pooled_features, int *pooled_empty_flag){
// printf("batch_size=%d, pts_num=%d, boxes_num=%d\n", batch_size, pts_num, boxes_num);
int *pts_assign = NULL;
cudaMalloc(&pts_assign, batch_size * pts_num * boxes_num * sizeof(int)); // (batch_size, N, M)
// cudaMemset(&pts_assign, -1, batch_size * pts_num * boxes_num * sizeof(int));
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
assign_pts_to_box3d<<<blocks, threads>>>(batch_size, pts_num, boxes_num, xyz, boxes3d, pts_assign);
int *pts_idx = NULL;
cudaMalloc(&pts_idx, batch_size * boxes_num * sampled_pts_num * sizeof(int)); // (batch_size, M, sampled_pts_num)
dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK), batch_size); // blockIdx.x(col), blockIdx.y(row)
get_pooled_idx<<<blocks2, threads>>>(batch_size, pts_num, boxes_num, sampled_pts_num, pts_assign, pts_idx, pooled_empty_flag);
dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
roipool3d_forward<<<blocks_pool, threads>>>(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
xyz, pts_idx, pts_feature, pooled_features, pooled_empty_flag);
cudaFree(pts_assign);
cudaFree(pts_idx);
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
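Note the sampling rule in get_pooled_idx: a box with fewer than sampled_pts_num inside-points has its remaining slots filled by cycling through the points already found, and a box with none only sets pooled_empty_flag. The same rule in Python, for illustration:

def pad_indices(idx, sampled_pts_num):
    cnt = len(idx)
    if cnt == 0:
        return None  # caller sets pooled_empty_flag instead
    return [idx[k % cnt] for k in range(sampled_pts_num)]

assert pad_indices([7, 9], 5) == [7, 9, 7, 9, 7]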
# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.cnn import build_conv_layer, build_norm_layer
from mmcv.ops import SparseModule, SparseSequential
from torch import nn

from mmdet.models.backbones.resnet import BasicBlock, Bottleneck


class SparseBottleneck(Bottleneck, SparseModule):
    """Sparse bottleneck block for PartA^2.

    Bottleneck block implemented with submanifold sparse convolution.
@@ -32,7 +32,7 @@ class SparseBottleneck(Bottleneck, spconv.SparseModule):
                 conv_cfg=None,
                 norm_cfg=None):
        SparseModule.__init__(self)
        Bottleneck.__init__(
            self,
            inplanes,
@@ -65,7 +65,7 @@ class SparseBottleneck(Bottleneck, spconv.SparseModule):
        return out


class SparseBasicBlock(BasicBlock, SparseModule):
    """Sparse basic block for PartA^2.

    Sparse basic block implemented with submanifold sparse convolution.
@@ -90,7 +90,7 @@ class SparseBasicBlock(BasicBlock, spconv.SparseModule):
                 downsample=None,
                 conv_cfg=None,
                 norm_cfg=None):
        SparseModule.__init__(self)
        BasicBlock.__init__(
            self,
            inplanes,
@@ -182,5 +182,5 @@ def make_sparse_convmodule(in_channels,
        elif layer == 'act':
            layers.append(nn.ReLU(inplace=True))

    layers = SparseSequential(*layers)
    return layers
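A hedged usage sketch for the updated blocks: SparseBasicBlock keeps BasicBlock's (inplanes, planes) positional signature but operates on sparse tensors, with conv_cfg selecting a sparse conv layer from the CONV_LAYERS registry. The exact cfg keys below are assumptions modeled on typical mmdetection3d configs:

block = SparseBasicBlock(
    16,  # inplanes
    16,  # planes
    conv_cfg=dict(type='SubMConv3d', indice_key='subm1'),
    norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01))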
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
SparseConvTranspose3d, SparseInverseConv2d,
SparseInverseConv3d, SubMConv2d, SubMConv3d)
from .modules import SparseModule, SparseSequential
from .pool import SparseMaxPool2d, SparseMaxPool3d
from .structure import SparseConvTensor, scatter_nd
__all__ = [
'SparseConv2d',
'SparseConv3d',
'SubMConv2d',
'SubMConv3d',
'SparseConvTranspose2d',
'SparseConvTranspose3d',
'SparseInverseConv2d',
'SparseInverseConv3d',
'SparseModule',
'SparseSequential',
'SparseMaxPool2d',
'SparseMaxPool3d',
'SparseConvTensor',
'scatter_nd',
]
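This vendored spconv package is retained only for backward compatibility (see "recover ops/__init__.py for bc breaking" in the commit message); new code is expected to import the same names from mmcv, e.g. (availability in a recent mmcv release is an assumption):

from mmcv.ops import (SparseConvTensor, SparseModule, SparseSequential,
                      SubMConv3d)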
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import torch
from mmcv.cnn import CONV_LAYERS
from torch.nn import init
from torch.nn.parameter import Parameter
from . import functional as Fsp
from . import ops
from .modules import SparseModule
from .structure import SparseConvTensor
def _calculate_fan_in_and_fan_out_hwio(tensor):
dimensions = tensor.ndimension()
if dimensions < 2:
        raise ValueError('fan in and fan out can not be computed for tensor '
                         'with fewer than 2 dimensions')
if dimensions == 2: # Linear
fan_in = tensor.size(-2)
fan_out = tensor.size(-1)
else:
num_input_fmaps = tensor.size(-2)
num_output_fmaps = tensor.size(-1)
receptive_field_size = 1
if tensor.dim() > 2:
receptive_field_size = tensor[..., 0, 0].numel()
fan_in = num_input_fmaps * receptive_field_size
fan_out = num_output_fmaps * receptive_field_size
return fan_in, fan_out
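Unlike torch.nn.init's built-in fan computation, which assumes an (out, in, *kernel) weight layout, this helper reads the fans from the (*kernel, in, out) layout that spconv weights use. A quick check, for illustration:

import torch

w = torch.empty(3, 3, 3, 16, 32)  # (k, k, k, in_channels, out_channels)
fan_in, fan_out = _calculate_fan_in_and_fan_out_hwio(w)
assert (fan_in, fan_out) == (16 * 27, 32 * 27)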
class SparseConvolution(SparseModule):
def __init__(self,
ndim,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
subm=False,
output_padding=0,
transposed=False,
inverse=False,
indice_key=None,
fused_bn=False):
super(SparseConvolution, self).__init__()
assert groups == 1
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
if not isinstance(output_padding, (list, tuple)):
output_padding = [output_padding] * ndim
for d, s in zip(dilation, stride):
assert any([s == 1, d == 1]), "don't support this."
self.ndim = ndim
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.conv1x1 = np.prod(kernel_size) == 1
self.stride = stride
self.padding = padding
self.dilation = dilation
self.transposed = transposed
self.inverse = inverse
self.output_padding = output_padding
self.groups = groups
self.subm = subm
self.indice_key = indice_key
self.fused_bn = fused_bn
self.weight = Parameter(
torch.Tensor(*kernel_size, in_channels, out_channels))
if bias:
self.bias = Parameter(torch.Tensor(out_channels))
else:
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self):
init.kaiming_uniform_(self.weight, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
bound = 1 / math.sqrt(fan_in)
init.uniform_(self.bias, -bound, bound)
def forward(self, input):
assert isinstance(input, SparseConvTensor)
features = input.features
device = features.device
indices = input.indices
spatial_shape = input.spatial_shape
batch_size = input.batch_size
if not self.subm:
if self.transposed:
out_spatial_shape = ops.get_deconv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,
self.dilation, self.output_padding)
else:
out_spatial_shape = ops.get_conv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,
self.dilation)
else:
out_spatial_shape = spatial_shape
# input.update_grid(out_spatial_shape)
# t = time.time()
if self.conv1x1:
features = torch.mm(
input.features,
self.weight.view(self.in_channels, self.out_channels))
if self.bias is not None:
features += self.bias
out_tensor = SparseConvTensor(features, input.indices,
input.spatial_shape,
input.batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
data = input.find_indice_pair(self.indice_key)
if self.inverse:
assert data is not None and self.indice_key is not None
_, outids, indice_pairs, indice_pair_num, out_spatial_shape = data
assert indice_pairs.shape[0] == np.prod(
self.kernel_size
), 'inverse conv must have same kernel size as its couple conv'
else:
if self.indice_key is not None and data is not None:
outids, _, indice_pairs, indice_pair_num, _ = data
else:
outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
indices,
batch_size,
spatial_shape,
self.kernel_size,
self.stride,
self.padding,
self.dilation,
self.output_padding,
self.subm,
self.transposed,
grid=input.grid)
input.indice_dict[self.indice_key] = (outids, indices,
indice_pairs,
indice_pair_num,
spatial_shape)
if self.fused_bn:
assert self.bias is not None
out_features = ops.fused_indice_conv(features, self.weight,
self.bias,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0], self.inverse,
self.subm)
else:
if self.subm:
out_features = Fsp.indice_subm_conv(features, self.weight,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0])
else:
if self.inverse:
out_features = Fsp.indice_inverse_conv(
features, self.weight, indice_pairs.to(device),
indice_pair_num, outids.shape[0])
else:
out_features = Fsp.indice_conv(features, self.weight,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0])
if self.bias is not None:
out_features += self.bias
out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,
batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
@CONV_LAYERS.register_module(force=True)
class SparseConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseConv4d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv4d, self).__init__(
4,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseConvTranspose2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConvTranspose2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
transposed=True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseConvTranspose3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConvTranspose3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
transposed=True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseInverseConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
indice_key,
bias=True):
super(SparseInverseConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
bias=bias,
inverse=True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseInverseConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
indice_key,
bias=True):
super(SparseInverseConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
bias=bias,
inverse=True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SubMConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
            True,  # subm flag: builds a submanifold convolution
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SubMConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
            True,  # subm flag: builds a submanifold convolution
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SubMConv4d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv4d, self).__init__(
4,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
            True,  # subm flag: builds a submanifold convolution
indice_key=indice_key)
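The registry wrappers above only fix `ndim` and the `subm`/`transposed`/`inverse` flags; all the actual work happens in `SparseConvolution.forward`. A minimal usage sketch follows (illustrative only: it assumes the compiled spconv extension, a CUDA device, and spconv's `SparseConvTensor`; indices are int32 rows of `(batch_idx, z, y, x)`):

import torch

features = torch.randn(4, 16).cuda()  # [N, C] per-voxel features
indices = torch.tensor([[0, 0, 0, 0],
                        [0, 1, 1, 1],
                        [0, 2, 3, 1],
                        [0, 3, 2, 0]], dtype=torch.int32).cuda()
x = SparseConvTensor(features, indices, spatial_shape=[4, 4, 4], batch_size=1)
conv = SubMConv3d(16, 32, kernel_size=3, padding=1, indice_key='subm1').cuda()
out = conv(x)  # indice pairs are computed once and cached under 'subm1'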
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from torch.autograd import Function
from . import ops
class SparseConvFunction(Function):
    """Autograd function wrapping the regular sparse convolution op."""
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, False)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
False)
return input_bp, filters_bp, None, None, None
class SparseInverseConvFunction(Function):
    """Autograd function wrapping the inverse sparse convolution op."""
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, True, False)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
True, False)
return input_bp, filters_bp, None, None, None
class SubMConvFunction(Function):
    """Autograd function wrapping the submanifold sparse convolution op."""
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, False, True)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
False, True)
return input_bp, filters_bp, None, None, None
class SparseMaxPoolFunction(Function):
    """Autograd function wrapping the sparse max pooling op."""
@staticmethod
def forward(ctx, features, indice_pairs, indice_pair_num,
num_activate_out):
out = ops.indice_maxpool(features, indice_pairs, indice_pair_num,
num_activate_out)
ctx.save_for_backward(indice_pairs, indice_pair_num, features, out)
return out
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, out = ctx.saved_tensors
input_bp = ops.indice_maxpool_backward(features, out, grad_output,
indice_pairs, indice_pair_num)
return input_bp, None, None, None
indice_conv = SparseConvFunction.apply
indice_inverse_conv = SparseInverseConvFunction.apply
indice_subm_conv = SubMConvFunction.apply
indice_maxpool = SparseMaxPoolFunction.apply
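Only `features` and `filters` are differentiable in the functions above, which is why every `backward` returns `None` for the index arguments. A runnable toy analogue of the same gather-GEMM pattern in plain PyTorch (no spconv extension needed; `ScatterMatMul` is a made-up name for illustration):

import torch
from torch.autograd import Function


class ScatterMatMul(Function):
    """Toy analogue of SparseConvFunction: gather rows, multiply by the
    filter, and return a None grad for the non-differentiable index arg."""

    @staticmethod
    def forward(ctx, features, filters, index):
        ctx.save_for_backward(features, filters, index)
        return features.index_select(0, index) @ filters

    @staticmethod
    def backward(ctx, grad_output):
        features, filters, index = ctx.saved_tensors
        grad_features = torch.zeros_like(features).index_add_(
            0, index, grad_output @ filters.t())
        grad_filters = features.index_select(0, index).t() @ grad_output
        return grad_features, grad_filters, None  # no grad for `index`


features = torch.randn(5, 3, requires_grad=True)
filters = torch.randn(3, 4, requires_grad=True)
index = torch.tensor([0, 2, 4])
ScatterMatMul.apply(features, filters, index).sum().backward()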
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_
#include <tuple>
#include <vector>
namespace detail {
template <class T>
int getTotalSize(std::vector<T> arg) {
return arg.size();
}
template <class T, class... TArgs>
int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
return arg.size() * getTotalSize(args...);
}
template <typename T>
int getSize(std::vector<T> arg) {
return arg.size();
}
template <int Idx, class TT, class T>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {
std::get<Idx>(src) = arg[counter[Idx]];
}
template <int Idx, class TT, class T, class... TArgs>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg,
std::vector<TArgs> &... args) {
std::get<Idx>(src) = arg[counter[Idx]];
assigner<Idx + 1>(src, counter, args...);
}
} // namespace detail
template <class... TArgs>
std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
int length = detail::getTotalSize(args...);
std::vector<int> sizes = {detail::getSize(args)...};
int size = sizes.size();
std::vector<std::tuple<TArgs...>> params(length);
std::vector<int> counter(size);
for (int i = 0; i < length; ++i) {
detail::assigner<0>(params[i], counter, args...);
counter[size - 1] += 1;
for (int c = size - 1; c >= 0; --c) {
if (counter[c] == sizes[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return params;
}
#endif
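`paramsGrid` enumerates the Cartesian product of its argument vectors, with `counter` acting as a mixed-radix odometer (the last parameter varies fastest). For reference, the same enumeration in Python is just `itertools.product`:

from itertools import product

params = list(product([1, 2], ['a', 'b', 'c'], [0.1]))
# -> [(1, 'a', 0.1), (1, 'b', 0.1), (1, 'c', 0.1),
#     (2, 'a', 0.1), (2, 'b', 0.1), (2, 'c', 0.1)]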
// Copyright Louis Delacroix 2010 - 2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
// A pretty printing library for C++
//
// Usage:
// Include this header, and operator<< will "just work".
#ifndef H_PRETTY_PRINT
#define H_PRETTY_PRINT
#include <cstddef>
#include <iterator>
#include <memory>
#include <ostream>
#include <set>
#include <tuple>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <valarray>
namespace pretty_print {
namespace detail {
// SFINAE type trait to detect whether T::const_iterator exists.
struct sfinae_base {
using yes = char;
using no = yes[2];
};
template <typename T>
struct has_const_iterator : private sfinae_base {
private:
template <typename C>
static yes &test(typename C::const_iterator *);
template <typename C>
static no &test(...);
public:
static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
using type = T;
};
template <typename T>
struct has_begin_end : private sfinae_base {
private:
template <typename C>
static yes &
f(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
const>(&C::begin)),
typename C::const_iterator (C::*)() const>::value>::type *);
template <typename C>
static no &f(...);
template <typename C>
static yes &g(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (
C::*)() const>(&C::end)),
typename C::const_iterator (C::*)() const>::value,
void>::type *);
template <typename C>
static no &g(...);
public:
static bool const beg_value = sizeof(f<T>(nullptr)) == sizeof(yes);
static bool const end_value = sizeof(g<T>(nullptr)) == sizeof(yes);
};
} // namespace detail
// Holds the delimiter values for a specific character type
template <typename TChar>
struct delimiters_values {
using char_type = TChar;
const char_type *prefix;
const char_type *delimiter;
const char_type *postfix;
};
// Defines the delimiter values for a specific container and character type
template <typename T, typename TChar>
struct delimiters {
using type = delimiters_values<TChar>;
static const type values;
};
// Functor to print containers. You can use this directly if you want
// to specify a non-default delimiters type. The printing logic can
// be customized by specializing the nested template.
template <typename T, typename TChar = char,
typename TCharTraits = ::std::char_traits<TChar>,
typename TDelimiters = delimiters<T, TChar>>
struct print_container_helper {
using delimiters_type = TDelimiters;
using ostream_type = std::basic_ostream<TChar, TCharTraits>;
template <typename U>
struct printer {
static void print_body(const U &c, ostream_type &stream) {
using std::begin;
using std::end;
auto it = begin(c);
const auto the_end = end(c);
if (it != the_end) {
for (;;) {
stream << *it;
if (++it == the_end) break;
if (delimiters_type::values.delimiter != NULL)
stream << delimiters_type::values.delimiter;
}
}
}
};
print_container_helper(const T &container) : container_(container) {}
inline void operator()(ostream_type &stream) const {
if (delimiters_type::values.prefix != NULL)
stream << delimiters_type::values.prefix;
printer<T>::print_body(container_, stream);
if (delimiters_type::values.postfix != NULL)
stream << delimiters_type::values.postfix;
}
private:
const T &container_;
};
// Specialization for pairs
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename T1, typename T2>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::pair<T1, T2>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
stream << c.first;
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << c.second;
}
};
// Specialization for tuples
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename... Args>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::tuple<Args...>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
using element_type = std::tuple<Args...>;
template <std::size_t I>
struct Int {};
static void print_body(const element_type &c, ostream_type &stream) {
tuple_print(c, stream, Int<0>());
}
static void tuple_print(const element_type &, ostream_type &,
Int<sizeof...(Args)>) {}
static void tuple_print(
const element_type &c, ostream_type &stream,
typename std::conditional<sizeof...(Args) != 0, Int<0>,
std::nullptr_t>::type) {
stream << std::get<0>(c);
tuple_print(c, stream, Int<1>());
}
template <std::size_t N>
static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << std::get<N>(c);
tuple_print(c, stream, Int<N + 1>());
}
};
// Prints a print_container_helper to the specified stream.
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &stream,
const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
helper(stream);
return stream;
}
// Basic is_container template; specialize to derive from std::true_type for all
// desired container types
template <typename T>
struct is_container
: public std::integral_constant<bool,
detail::has_const_iterator<T>::value &&
detail::has_begin_end<T>::beg_value &&
detail::has_begin_end<T>::end_value> {};
template <typename T, std::size_t N>
struct is_container<T[N]> : std::true_type {};
template <std::size_t N>
struct is_container<char[N]> : std::false_type {};
template <typename T>
struct is_container<std::valarray<T>> : std::true_type {};
template <typename T1, typename T2>
struct is_container<std::pair<T1, T2>> : std::true_type {};
template <typename... Args>
struct is_container<std::tuple<Args...>> : std::true_type {};
// Default delimiters
template <typename T>
struct delimiters<T, char> {
static const delimiters_values<char> values;
};
template <typename T>
const delimiters_values<char> delimiters<T, char>::values = {"[", ", ", "]"};
template <typename T>
struct delimiters<T, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T>
const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {L"[", L", ",
L"]"};
// Delimiters for (multi)set and unordered_(multi)set
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::set<T, TComp, TAllocator>, char>::values = {"{", ", ",
"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::set<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::multiset<T, TComp, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t>::values = {L"{", L", ", L"}"};
// Delimiters for pair and tuple
template <typename T1, typename T2>
struct delimiters<std::pair<T1, T2>, char> {
static const delimiters_values<char> values;
};
template <typename T1, typename T2>
const delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = {
"(", ", ", ")"};
template <typename T1, typename T2>
struct delimiters<::std::pair<T1, T2>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T1, typename T2>
const delimiters_values<wchar_t>
delimiters<::std::pair<T1, T2>, wchar_t>::values = {L"(", L", ", L")"};
template <typename... Args>
struct delimiters<std::tuple<Args...>, char> {
static const delimiters_values<char> values;
};
template <typename... Args>
const delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = {
"(", ", ", ")"};
template <typename... Args>
struct delimiters<::std::tuple<Args...>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename... Args>
const delimiters_values<wchar_t>
delimiters<::std::tuple<Args...>, wchar_t>::values = {L"(", L", ", L")"};
// Type-erasing helper class for easy use of custom delimiters.
// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
// and MyDelims needs to be defined for TChar. Usage: "cout <<
// pretty_print::custom_delims<MyDelims>(x)".
struct custom_delims_base {
virtual ~custom_delims_base() {}
virtual std::ostream &stream(::std::ostream &) = 0;
virtual std::wostream &stream(::std::wostream &) = 0;
};
template <typename T, typename Delims>
struct custom_delims_wrapper : custom_delims_base {
custom_delims_wrapper(const T &t_) : t(t_) {}
std::ostream &stream(std::ostream &s) {
return s << print_container_helper<T, char, std::char_traits<char>, Delims>(
t);
}
std::wostream &stream(std::wostream &s) {
return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,
Delims>(t);
}
private:
const T &t;
};
template <typename Delims>
struct custom_delims {
template <typename Container>
custom_delims(const Container &c)
: base(new custom_delims_wrapper<Container, Delims>(c)) {}
std::unique_ptr<custom_delims_base> base;
};
template <typename TChar, typename TCharTraits, typename Delims>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &s, const custom_delims<Delims> &p) {
return p.base->stream(s);
}
// A wrapper for a C-style array given as pointer-plus-size.
// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
template <typename T>
struct array_wrapper_n {
typedef const T *const_iterator;
typedef T value_type;
array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {}
inline const_iterator begin() const { return _array; }
inline const_iterator end() const { return _array + _n; }
private:
const T *const _array;
size_t _n;
};
// A wrapper for hash-table based containers that offer local iterators to each
// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket
// 4 of container m.)
template <typename T>
struct bucket_print_wrapper {
typedef typename T::const_local_iterator const_iterator;
typedef typename T::size_type size_type;
const_iterator begin() const { return m_map.cbegin(n); }
const_iterator end() const { return m_map.cend(n); }
bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {}
private:
const T &m_map;
const size_type n;
};
} // namespace pretty_print
// Global accessor functions for the convenience wrappers
template <typename T>
inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
size_t n) {
return pretty_print::array_wrapper_n<T>(a, n);
}
template <typename T>
pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
typename T::size_type n) {
return pretty_print::bucket_print_wrapper<T>(m, n);
}
// Main magic entry point: An overload snuck into namespace std.
// Can we do better?
namespace std {
// Prints a container to the stream using default delimiters
template <typename T, typename TChar, typename TCharTraits>
inline typename enable_if<::pretty_print::is_container<T>::value,
basic_ostream<TChar, TCharTraits> &>::type
operator<<(basic_ostream<TChar, TCharTraits> &stream, const T &container) {
return stream
<< ::pretty_print::print_container_helper<T, TChar, TCharTraits>(
container);
}
} // namespace std
#endif // H_PRETTY_PRINT
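The header's core idea: detect container-ness at compile time with the SFINAE traits above, then stream elements recursively using per-type delimiter tables. A rough Python analogue of that delimiter dispatch (illustrative only; Python's `repr` already does this natively):

DELIMS = {list: ('[', ', ', ']'), tuple: ('(', ', ', ')'), set: ('{', ', ', '}')}

def pretty(obj):
    # Pick delimiters by container type, recurse into elements,
    # and fall back to str() for non-containers.
    for typ, (pre, sep, post) in DELIMS.items():
        if isinstance(obj, typ):
            return pre + sep.join(pretty(x) for x in obj) + post
    return str(obj)

print(pretty([(1, 2), [3, 4]]))  # [(1, 2), [3, 4]]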
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <iostream>
#include <pybind11/embed.h> // everything needed for embedding
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <tensorview/tensorview.h>
namespace py = pybind11;
template <typename T, typename TPyObject>
std::vector<T> array2Vector(TPyObject arr) {
py::array arr_np = arr;
size_t size = arr.attr("size").template cast<size_t>();
py::array_t<T> arr_cc = arr_np;
std::vector<T> data(arr_cc.data(), arr_cc.data() + size);
return data;
}
template <typename T>
std::vector<T> arrayT2Vector(py::array_t<T> arr) {
std::vector<T> data(arr.data(), arr.data() + arr.size());
return data;
}
template <typename T, typename TPyObject>
tv::TensorView<T> array2TensorView(TPyObject arr) {
py::array arr_np = arr;
py::array_t<T> arr_cc = arr_np;
tv::Shape shape;
for (int i = 0; i < arr_cc.ndim(); ++i){
shape.push_back(arr_cc.shape(i));
}
return tv::TensorView<T>(arr_cc.mutable_data(), shape);
}
template <typename T>
tv::TensorView<T> arrayT2TensorView(py::array_t<T> arr) {
tv::Shape shape;
for (int i = 0; i < arr.ndim(); ++i){
shape.push_back(arr.shape(i));
}
return tv::TensorView<T>(arr.mutable_data(), shape);
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef FUSED_SPARSE_CONV_OP_H_
#define FUSED_SPARSE_CONV_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit only supports int64 scalar arguments, so the flags below are
// passed as int64 and narrowed to bool here.
template <typename T>
torch::Tensor fusedIndiceConvBatchNorm(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
indicePairNumVec.begin(), indicePairNumVec.end());
indicePairMaxSize = *indicePairVecMaxSizeIter;
}*/
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output =
torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
  if (subM) {  // the center offset of a subm conv needs no gather/scatter-add;
               // it is computed as a plain GEMM directly on the features.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
} else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
TV_CHECK_CUDA_ERR();
}
// totalSAddTime += timer.report() / 1000.0;
}
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
} // namespace spconv
#endif
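`fusedIndiceConvBatchNorm` is a gather-GEMM-scatter loop with the bias folded in by pre-filling the output: for each kernel offset, it gathers the `nHot` input rows named by `indicePairs`, multiplies them by that offset's filter, and scatter-adds the result into the output rows. A simplified PyTorch sketch of the same data flow (toy signature for illustration; the real kernel also special-cases the subm center offset and uses the extension's gather/scatter functors):

import torch

def fused_indice_conv_sketch(features, filters, bias,
                             indice_pairs, indice_pair_num, num_act_out):
    # features: [N_in, C_in]; filters: [K, C_in, C_out]
    # indice_pairs: [K, 2, P] of (input row, output row) pairs per offset
    output = bias.expand(num_act_out, -1).clone()    # bias fused into output
    for k in range(filters.shape[0]):
        n_hot = int(indice_pair_num[k])
        if n_hot <= 0:
            continue
        in_idx = indice_pairs[k, 0, :n_hot].long()
        out_idx = indice_pairs[k, 1, :n_hot].long()
        gathered = features.index_select(0, in_idx)  # gather
        partial = gathered @ filters[k]              # GEMM
        output.index_add_(0, out_idx, partial)       # scatter-add
    return output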