Bump version to V1.0.0rc0

Bump version to V1.0.0rc0

Bump version to V1.0.0rc0
32a4328b · Wenwei Zhang · GitHub · 86cc487c · a8817998 · 32a4328b
Unverified Commit 32a4328b authored Feb 24, 2022 by Wenwei Zhang Committed by GitHub Feb 24, 2022
20 changed files
--- a/mmdet3d/ops/norm.py
+++ b/mmdet3d/ops/norm.py
+# Copyright (c) OpenMMLab. All rights reserved.
 import torch
 from mmcv.cnn import NORM_LAYERS
 from mmcv.runner import force_fp32
@@ -26,7 +27,7 @@ class AllReduce(Function):

 @NORM_LAYERS.register_module('naiveSyncBN1d')
 class NaiveSyncBatchNorm1d(nn.BatchNorm1d):
-    """Syncronized Batch Normalization for 3D Tensors.
+    """Synchronized Batch Normalization for 3D Tensors.

    Note:
        This implementation is modified from
@@ -37,7 +38,7 @@ class NaiveSyncBatchNorm1d(nn.BatchNorm1d):
        when the batch size on each worker is quite different
        (e.g., when scale augmentation is used).
        In 3D detection, different workers has points of different shapes,
-        whish also cause instability.
+        which also cause instability.

        Use this implementation before `nn.SyncBatchNorm` is fixed.
        It is slower than `nn.SyncBatchNorm`.
@@ -80,7 +81,7 @@ class NaiveSyncBatchNorm1d(nn.BatchNorm1d):

 @NORM_LAYERS.register_module('naiveSyncBN2d')
 class NaiveSyncBatchNorm2d(nn.BatchNorm2d):
-    """Syncronized Batch Normalization for 4D Tensors.
+    """Synchronized Batch Normalization for 4D Tensors.

    Note:
        This implementation is modified from

--- a/mmdet3d/ops/paconv/__init__.py
+++ b/mmdet3d/ops/paconv/__init__.py
+# Copyright (c) OpenMMLab. All rights reserved.
 from .assign_score import assign_score_withk
 from .paconv import PAConv, PAConvCUDA


--- a/mmdet3d/ops/paconv/assign_score.py
+++ b/mmdet3d/ops/paconv/assign_score.py
+# Copyright (c) OpenMMLab. All rights reserved.
 from torch.autograd import Function

 from . import assign_score_withk_ext

--- a/mmdet3d/ops/paconv/paconv.py
+++ b/mmdet3d/ops/paconv/paconv.py
+# Copyright (c) OpenMMLab. All rights reserved.
 import copy
+
 import torch
 from mmcv.cnn import (ConvModule, build_activation_layer, build_norm_layer,
                      constant_init)
@@ -83,7 +85,7 @@ class ScoreNet(nn.Module):
        Args:
            xyz_features (torch.Tensor): (B, C, N, K), features constructed
                from xyz coordinates of point pairs. May contain relative
-                positions, Euclidian distance, etc.
+                positions, Euclidean distance, etc.

        Returns:
            torch.Tensor: (B, N, K, M), predicted scores for `M` kernels.
@@ -174,7 +176,7 @@ class PAConv(nn.Module):
            # (grouped_xyz - center_xyz, grouped_xyz)
            self.scorenet_in_channels = 6
        elif scorenet_input == 'w_neighbor_dist':
-            # (center_xyz, grouped_xyz - center_xyz, Euclidian distance)
+            # (center_xyz, grouped_xyz - center_xyz, Euclidean distance)
            self.scorenet_in_channels = 7
        else:
            raise NotImplementedError(

--- a/mmdet3d/ops/paconv/utils.py
+++ b/mmdet3d/ops/paconv/utils.py
+# Copyright (c) OpenMMLab. All rights reserved.
 import torch


 def calc_euclidian_dist(xyz1, xyz2):
-    """Calculate the Euclidian distance between two sets of points.
+    """Calculate the Euclidean distance between two sets of points.

    Args:
        xyz1 (torch.Tensor): (N, 3), the first set of points.
        xyz2 (torch.Tensor): (N, 3), the second set of points.

    Returns:
-        torch.Tensor: (N, ), the Euclidian distance between each point pair.
+        torch.Tensor: (N, ), the Euclidean distance between each point pair.
    """
    assert xyz1.shape[0] == xyz2.shape[0], 'number of points are not the same'
    assert xyz1.shape[1] == xyz2.shape[1] == 3, \

--- a/mmdet3d/ops/pointnet_modules/paconv_sa_module.py
+++ b/mmdet3d/ops/pointnet_modules/paconv_sa_module.py
@@ -28,7 +28,7 @@ class PAConvSAModuleMSG(BasePointSAModule):
            - 'w_neighbor': Use xyz coordinates and the difference with center
                points as input.
            - 'w_neighbor_dist': Use xyz coordinates, the difference with
-                center points and the Euclidian distance as input.
+                center points and the Euclidean distance as input.

        scorenet_cfg (dict, optional): Config of the ScoreNet module, which
            may contain the following keys and values:
@@ -239,11 +239,12 @@ class PAConvCUDASAModuleMSG(BasePointSAModule):

        Args:
            points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
-            features (Tensor): (B, C, N) features of each point.
+            features (Tensor, optional): (B, C, N) features of each point.
                Default: None.
-            indices (Tensor): (B, num_point) Index of the features.
+            indices (Tensor, optional): (B, num_point) Index of the features.
+                Default: None.
+            target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs.
                Default: None.
-            target_xyz (Tensor): (B, M, 3) new_xyz coordinates of the outputs.

        Returns:
            Tensor: (B, M, 3) where M is the number of points.

--- a/mmdet3d/ops/pointnet_modules/point_fp_module.py
+++ b/mmdet3d/ops/pointnet_modules/point_fp_module.py
 # Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
 import torch
 from mmcv.cnn import ConvModule
 from mmcv.runner import BaseModule, force_fp32
 from torch import nn as nn
-from typing import List

 from mmdet3d.ops import three_interpolate, three_nn

@@ -15,7 +16,7 @@ class PointFPModule(BaseModule):

    Args:
        mlp_channels (list[int]): List of mlp channels.
-        norm_cfg (dict): Type of normalization method.
+        norm_cfg (dict, optional): Type of normalization method.
            Default: dict(type='BN2d').
    """


--- a/mmdet3d/ops/pointnet_modules/point_sa_module.py
+++ b/mmdet3d/ops/pointnet_modules/point_sa_module.py
@@ -18,25 +18,25 @@ class BasePointSAModule(nn.Module):
        sample_nums (list[int]): Number of samples in each ball query.
        mlp_channels (list[list[int]]): Specify of the pointnet before
            the global pooling for each scale.
-        fps_mod (list[str]: Type of FPS method, valid mod
+        fps_mod (list[str], optional): Type of FPS method, valid mod
            ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS'].
            F-FPS: using feature distances for FPS.
            D-FPS: using Euclidean distances of points for FPS.
            FS: using F-FPS and D-FPS simultaneously.
-        fps_sample_range_list (list[int]): Range of points to apply FPS.
-            Default: [-1].
-        dilated_group (bool): Whether to use dilated ball query.
+        fps_sample_range_list (list[int], optional):
+            Range of points to apply FPS. Default: [-1].
+        dilated_group (bool, optional): Whether to use dilated ball query.
            Default: False.
-        use_xyz (bool): Whether to use xyz.
+        use_xyz (bool, optional): Whether to use xyz.
            Default: True.
-        pool_mod (str): Type of pooling method.
+        pool_mod (str, optional): Type of pooling method.
            Default: 'max_pool'.
-        normalize_xyz (bool): Whether to normalize local XYZ with radius.
-            Default: False.
-        grouper_return_grouped_xyz (bool): Whether to return grouped xyz in
-            `QueryAndGroup`. Defaults to False.
-        grouper_return_grouped_idx (bool): Whether to return grouped idx in
-            `QueryAndGroup`. Defaults to False.
+        normalize_xyz (bool, optional): Whether to normalize local XYZ
+            with radius. Default: False.
+        grouper_return_grouped_xyz (bool, optional): Whether to return
+            grouped xyz in `QueryAndGroup`. Defaults to False.
+        grouper_return_grouped_idx (bool, optional): Whether to return
+            grouped idx in `QueryAndGroup`. Defaults to False.
    """

    def __init__(self,
@@ -69,6 +69,8 @@ class BasePointSAModule(nn.Module):
            self.num_point = [num_point]
        elif isinstance(num_point, list) or isinstance(num_point, tuple):
            self.num_point = num_point
+        elif num_point is None:
+            self.num_point = None
        else:
            raise NotImplementedError('Error type of num_point!')

@@ -78,8 +80,12 @@ class BasePointSAModule(nn.Module):
        self.fps_mod_list = fps_mod
        self.fps_sample_range_list = fps_sample_range_list

-        self.points_sampler = Points_Sampler(self.num_point, self.fps_mod_list,
-                                             self.fps_sample_range_list)
+        if self.num_point is not None:
+            self.points_sampler = Points_Sampler(self.num_point,
+                                                 self.fps_mod_list,
+                                                 self.fps_sample_range_list)
+        else:
+            self.points_sampler = None

        for i in range(len(radii)):
            radius = radii[i]
@@ -111,9 +117,7 @@ class BasePointSAModule(nn.Module):
        Args:
            points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            features (Tensor): (B, C, N) features of each point.
-                Default: None.
            indices (Tensor): (B, num_point) Index of the features.
-                Default: None.
            target_xyz (Tensor): (B, M, 3) new_xyz coordinates of the outputs.

        Returns:
@@ -128,9 +132,12 @@ class BasePointSAModule(nn.Module):
        elif target_xyz is not None:
            new_xyz = target_xyz.contiguous()
        else:
-            indices = self.points_sampler(points_xyz, features)
-            new_xyz = gather_points(xyz_flipped, indices).transpose(
-                1, 2).contiguous() if self.num_point is not None else None
+            if self.num_point is not None:
+                indices = self.points_sampler(points_xyz, features)
+                new_xyz = gather_points(xyz_flipped,
+                                        indices).transpose(1, 2).contiguous()
+            else:
+                new_xyz = None

        return new_xyz, indices

@@ -169,11 +176,12 @@ class BasePointSAModule(nn.Module):

        Args:
            points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
-            features (Tensor): (B, C, N) features of each point.
+            features (Tensor, optional): (B, C, N) features of each point.
                Default: None.
-            indices (Tensor): (B, num_point) Index of the features.
+            indices (Tensor, optional): (B, num_point) Index of the features.
+                Default: None.
+            target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs.
                Default: None.
-            target_xyz (Tensor): (B, M, 3) new_xyz coordinates of the outputs.

        Returns:
            Tensor: (B, M, 3) where M is the number of points.
@@ -223,26 +231,26 @@ class PointSAModuleMSG(BasePointSAModule):
        sample_nums (list[int]): Number of samples in each ball query.
        mlp_channels (list[list[int]]): Specify of the pointnet before
            the global pooling for each scale.
-        fps_mod (list[str]: Type of FPS method, valid mod
+        fps_mod (list[str], optional): Type of FPS method, valid mod
            ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS'].
            F-FPS: using feature distances for FPS.
            D-FPS: using Euclidean distances of points for FPS.
            FS: using F-FPS and D-FPS simultaneously.
-        fps_sample_range_list (list[int]): Range of points to apply FPS.
-            Default: [-1].
-        dilated_group (bool): Whether to use dilated ball query.
+        fps_sample_range_list (list[int], optional): Range of points to
+            apply FPS. Default: [-1].
+        dilated_group (bool, optional): Whether to use dilated ball query.
            Default: False.
-        norm_cfg (dict): Type of normalization method.
+        norm_cfg (dict, optional): Type of normalization method.
            Default: dict(type='BN2d').
-        use_xyz (bool): Whether to use xyz.
+        use_xyz (bool, optional): Whether to use xyz.
            Default: True.
-        pool_mod (str): Type of pooling method.
+        pool_mod (str, optional): Type of pooling method.
            Default: 'max_pool'.
-        normalize_xyz (bool): Whether to normalize local XYZ with radius.
-            Default: False.
-        bias (bool | str): If specified as `auto`, it will be decided by the
-            norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise
-            False. Default: "auto".
+        normalize_xyz (bool, optional): Whether to normalize local XYZ
+            with radius. Default: False.
+        bias (bool | str, optional): If specified as `auto`, it will be
+            decided by `norm_cfg`. `bias` will be set as True if
+            `norm_cfg` is None, otherwise False. Default: 'auto'.
    """

    def __init__(self,
@@ -298,24 +306,24 @@ class PointSAModule(PointSAModuleMSG):
    Args:
        mlp_channels (list[int]): Specify of the pointnet before
            the global pooling for each scale.
-        num_point (int): Number of points.
+        num_point (int, optional): Number of points.
            Default: None.
-        radius (float): Radius to group with.
+        radius (float, optional): Radius to group with.
            Default: None.
-        num_sample (int): Number of samples in each ball query.
+        num_sample (int, optional): Number of samples in each ball query.
            Default: None.
-        norm_cfg (dict): Type of normalization method.
+        norm_cfg (dict, optional): Type of normalization method.
            Default: dict(type='BN2d').
-        use_xyz (bool): Whether to use xyz.
+        use_xyz (bool, optional): Whether to use xyz.
            Default: True.
-        pool_mod (str): Type of pooling method.
+        pool_mod (str, optional): Type of pooling method.
            Default: 'max_pool'.
-        fps_mod (list[str]: Type of FPS method, valid mod
+        fps_mod (list[str], optional): Type of FPS method, valid mod
            ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS'].
-        fps_sample_range_list (list[int]): Range of points to apply FPS.
-            Default: [-1].
-        normalize_xyz (bool): Whether to normalize local XYZ with radius.
-            Default: False.
+        fps_sample_range_list (list[int], optional): Range of points
+            to apply FPS. Default: [-1].
+        normalize_xyz (bool, optional): Whether to normalize local XYZ
+            with radius. Default: False.
    """

    def __init__(self,

--- a/mmdet3d/ops/roiaware_pool3d/__init__.py
+++ b/mmdet3d/ops/roiaware_pool3d/__init__.py
-from .points_in_boxes import (points_in_boxes_batch, points_in_boxes_cpu,
-                              points_in_boxes_gpu)
+# Copyright (c) OpenMMLab. All rights reserved.
+from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
+                              points_in_boxes_part)
 from .roiaware_pool3d import RoIAwarePool3d

 __all__ = [
-    'RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu',
-    'points_in_boxes_batch'
+    'RoIAwarePool3d', 'points_in_boxes_part', 'points_in_boxes_cpu',
+    'points_in_boxes_all'
 ]
--- a/mmdet3d/ops/roiaware_pool3d/points_in_boxes.py
+++ b/mmdet3d/ops/roiaware_pool3d/points_in_boxes.py
+# Copyright (c) OpenMMLab. All rights reserved.
 import torch

 from . import roiaware_pool3d_ext


-def points_in_boxes_gpu(points, boxes):
-    """Find points that are in boxes (CUDA)
+def points_in_boxes_part(points, boxes):
+    """Find the box in which each point is (CUDA).

    Args:
-        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR coordinate
+        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
        boxes (torch.Tensor): [B, T, 7],
-            num_valid_boxes <= T, [x, y, z, w, l, h, ry] in LiDAR coordinate,
-            (x, y, z) is the bottom center
+            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz] in
+            LiDAR/DEPTH coordinate, (x, y, z) is the bottom center

    Returns:
        box_idxs_of_pts (torch.Tensor): (B, M), default background = -1
    """
-    assert boxes.shape[0] == points.shape[0], \
+    assert points.shape[0] == boxes.shape[0], \
        f'Points and boxes should have the same batch size, ' \
-        f'got {boxes.shape[0]} and {boxes.shape[0]}'
+        f'got {points.shape[0]} and {boxes.shape[0]}'
    assert boxes.shape[2] == 7, \
        f'boxes dimension should be 7, ' \
        f'got unexpected shape {boxes.shape[2]}'
@@ -43,56 +44,61 @@ def points_in_boxes_gpu(points, boxes):
    if torch.cuda.current_device() != points_device:
        torch.cuda.set_device(points_device)

-    roiaware_pool3d_ext.points_in_boxes_gpu(boxes.contiguous(),
-                                            points.contiguous(),
-                                            box_idxs_of_pts)
+    roiaware_pool3d_ext.points_in_boxes_part(boxes.contiguous(),
+                                             points.contiguous(),
+                                             box_idxs_of_pts)

    return box_idxs_of_pts


 def points_in_boxes_cpu(points, boxes):
-    """Find points that are in boxes (CPU)
-
-    Note:
-        Currently, the output of this function is different from that of
-        points_in_boxes_gpu.
+    """Find all boxes in which each point is (CPU). The CPU version of
+    :meth:`points_in_boxes_all`.

    Args:
-        points (torch.Tensor): [npoints, 3]
-        boxes (torch.Tensor): [N, 7], in LiDAR coordinate,
-            (x, y, z) is the bottom center
+        points (torch.Tensor): [B, M, 3], [x, y, z] in
+            LiDAR/DEPTH coordinate
+        boxes (torch.Tensor): [B, T, 7],
+            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
+            (x, y, z) is the bottom center.

    Returns:
-        point_indices (torch.Tensor): (N, npoints)
+        box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0.
    """
-    # TODO: Refactor this function as a CPU version of points_in_boxes_gpu
-    assert boxes.shape[1] == 7, \
+    assert points.shape[0] == boxes.shape[0], \
+        f'Points and boxes should have the same batch size, ' \
+        f'got {points.shape[0]} and {boxes.shape[0]}'
+    assert boxes.shape[2] == 7, \
        f'boxes dimension should be 7, ' \
        f'got unexpected shape {boxes.shape[2]}'
-    assert points.shape[1] == 3, \
+    assert points.shape[2] == 3, \
        f'points dimension should be 3, ' \
        f'got unexpected shape {points.shape[2]}'
+    batch_size, num_points, _ = points.shape
+    num_boxes = boxes.shape[1]

-    point_indices = points.new_zeros((boxes.shape[0], points.shape[0]),
+    point_indices = points.new_zeros((batch_size, num_boxes, num_points),
                                     dtype=torch.int)
-    roiaware_pool3d_ext.points_in_boxes_cpu(boxes.float().contiguous(),
-                                            points.float().contiguous(),
-                                            point_indices)
+    for b in range(batch_size):
+        roiaware_pool3d_ext.points_in_boxes_cpu(boxes[b].float().contiguous(),
+                                                points[b].float().contiguous(),
+                                                point_indices[b])
+    point_indices = point_indices.transpose(1, 2)

    return point_indices


-def points_in_boxes_batch(points, boxes):
-    """Find points that are in boxes (CUDA)
+def points_in_boxes_all(points, boxes):
+    """Find all boxes in which each point is (CUDA).

    Args:
-        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR coordinate
+        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
        boxes (torch.Tensor): [B, T, 7],
-            num_valid_boxes <= T, [x, y, z, w, l, h, ry] in LiDAR coordinate,
+            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
            (x, y, z) is the bottom center.

    Returns:
-        box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0
+        box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0.
    """
    assert boxes.shape[0] == points.shape[0], \
        f'Points and boxes should have the same batch size, ' \
@@ -116,8 +122,8 @@ def points_in_boxes_batch(points, boxes):
    if torch.cuda.current_device() != points_device:
        torch.cuda.set_device(points_device)

-    roiaware_pool3d_ext.points_in_boxes_batch(boxes.contiguous(),
-                                              points.contiguous(),
-                                              box_idxs_of_pts)
+    roiaware_pool3d_ext.points_in_boxes_all(boxes.contiguous(),
+                                            points.contiguous(),
+                                            box_idxs_of_pts)

    return box_idxs_of_pts
--- a/mmdet3d/ops/roiaware_pool3d/roiaware_pool3d.py
+++ b/mmdet3d/ops/roiaware_pool3d/roiaware_pool3d.py
+# Copyright (c) OpenMMLab. All rights reserved.
 import mmcv
 import torch
 from torch import nn as nn

--- a/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp
+++ b/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp
@@ -15,9 +15,7 @@

 inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
                                      float &local_x, float &local_y) {
-  // should rotate pi/2 + alpha to translate LiDAR to local
-  float rot_angle = rz + M_PI / 2;
-  float cosa = cos(rot_angle), sina = sin(rot_angle);
+  float cosa = cos(-rz), sina = sin(-rz);
  local_x = shift_x * cosa + shift_y * (-sina);
  local_y = shift_x * sina + shift_y * cosa;
 }
@@ -25,23 +23,23 @@ inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
 inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
                                 float &local_x, float &local_y) {
  // param pt: (x, y, z)
-  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
+  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, cz in the
  // bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
-  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
-  cz += h / 2.0;  // shift to the center since cz in box3d is the bottom center
+  float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
+  cz += z_size / 2.0;  // shift to the center since cz in box3d is the bottom center

-  if (fabsf(z - cz) > h / 2.0) return 0;
+  if (fabsf(z - cz) > z_size / 2.0) return 0;
  lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);
-  float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
-                  (local_y > -w / 2.0) & (local_y < w / 2.0);
+  float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
+                  (local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
  return in_flag;
 }

 int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor pts_indices_tensor) {
-  // params boxes: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is the
+  // params boxes: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is the
  // bottom center, each box DO NOT overlaps params pts: (npoints, 3) [x, y, z]
  // in LiDAR coordinate params pts_indices: (N, npoints)


--- a/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu
+++ b/mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu
@@ -24,9 +24,7 @@
 __device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
                                             float rz, float &local_x,
                                             float &local_y) {
-  // should rotate pi/2 + alpha to translate LiDAR to local
-  float rot_angle = rz + M_PI / 2;
-  float cosa = cos(rot_angle), sina = sin(rot_angle);
+  float cosa = cos(-rz), sina = sin(-rz);
  local_x = shift_x * cosa + shift_y * (-sina);
  local_y = shift_x * sina + shift_y * cosa;
 }
@@ -34,25 +32,25 @@ __device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
 __device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
                                        float &local_x, float &local_y) {
  // param pt: (x, y, z)
-  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
+  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, cz in the
  // bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
-  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
-  cz += h / 2.0;  // shift to the center since cz in box3d is the bottom center
+  float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
+  cz += z_size / 2.0;  // shift to the center since cz in box3d is the bottom center

-  if (fabsf(z - cz) > h / 2.0) return 0;
+  if (fabsf(z - cz) > z_size / 2.0) return 0;
  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
-  float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
-                  (local_y > -w / 2.0) & (local_y < w / 2.0);
+  float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
+                  (local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
  return in_flag;
 }

-__global__ void points_in_boxes_kernel(int batch_size, int boxes_num,
-                                       int pts_num, const float *boxes,
-                                       const float *pts,
-                                       int *box_idx_of_points) {
-  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
+__global__ void points_in_boxes_part_kernel(int batch_size, int boxes_num,
+                                            int pts_num, const float *boxes,
+                                            const float *pts,
+                                            int *box_idx_of_points) {
+  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,
  // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default
  // -1
@@ -76,11 +74,11 @@ __global__ void points_in_boxes_kernel(int batch_size, int boxes_num,
  }
 }

-__global__ void points_in_boxes_batch_kernel(int batch_size, int boxes_num,
-                                             int pts_num, const float *boxes,
-                                             const float *pts,
-                                             int *box_idx_of_points) {
-  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
+__global__ void points_in_boxes_all_kernel(int batch_size, int boxes_num,
+                                           int pts_num, const float *boxes,
+                                           const float *pts,
+                                           int *box_idx_of_points) {
+  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,
  // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default
  // -1
@@ -104,10 +102,10 @@ __global__ void points_in_boxes_batch_kernel(int batch_size, int boxes_num,
  }
 }

-void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num,
-                              const float *boxes, const float *pts,
-                              int *box_idx_of_points) {
-  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
+void points_in_boxes_part_launcher(int batch_size, int boxes_num, int pts_num,
+                                   const float *boxes, const float *pts,
+                                   int *box_idx_of_points) {
+  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,
  // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default
  // -1
@@ -115,8 +113,8 @@ void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num,

  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
  dim3 threads(THREADS_PER_BLOCK);
-  points_in_boxes_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num,
-                                              boxes, pts, box_idx_of_points);
+  points_in_boxes_part_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num,
+                                                   boxes, pts, box_idx_of_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
@@ -129,17 +127,17 @@ void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num,
 #endif
 }

-void points_in_boxes_batch_launcher(int batch_size, int boxes_num, int pts_num,
-                                    const float *boxes, const float *pts,
-                                    int *box_idx_of_points) {
-  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
+void points_in_boxes_all_launcher(int batch_size, int boxes_num, int pts_num,
+                                  const float *boxes, const float *pts,
+                                  int *box_idx_of_points) {
+  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is
  // the bottom center, each box params pts: (B, npoints, 3) [x, y, z] in
  // LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1
  cudaError_t err;

  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
  dim3 threads(THREADS_PER_BLOCK);
-  points_in_boxes_batch_kernel<<<blocks, threads>>>(
+  points_in_boxes_all_kernel<<<blocks, threads>>>(
      batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points);

  err = cudaGetLastError();
@@ -153,9 +151,9 @@ void points_in_boxes_batch_launcher(int batch_size, int boxes_num, int pts_num,
 #endif
 }

-int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
-                        at::Tensor box_idx_of_points_tensor) {
-  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
+int points_in_boxes_part(at::Tensor boxes_tensor, at::Tensor pts_tensor,
+                         at::Tensor box_idx_of_points_tensor) {
+  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,
  // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default
  // -1
@@ -172,15 +170,15 @@ int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
  const float *pts = pts_tensor.data_ptr<float>();
  int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();

-  points_in_boxes_launcher(batch_size, boxes_num, pts_num, boxes, pts,
-                           box_idx_of_points);
+  points_in_boxes_part_launcher(batch_size, boxes_num, pts_num, boxes, pts,
+                                box_idx_of_points);

  return 1;
 }

-int points_in_boxes_batch(at::Tensor boxes_tensor, at::Tensor pts_tensor,
-                          at::Tensor box_idx_of_points_tensor) {
-  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
+int points_in_boxes_all(at::Tensor boxes_tensor, at::Tensor pts_tensor,
+                        at::Tensor box_idx_of_points_tensor) {
+  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate, z is
  // the bottom center. params pts: (B, npoints, 3) [x, y, z] in LiDAR
  // coordinate params boxes_idx_of_points: (B, npoints), default -1

@@ -196,8 +194,8 @@ int points_in_boxes_batch(at::Tensor boxes_tensor, at::Tensor pts_tensor,
  const float *pts = pts_tensor.data_ptr<float>();
  int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();

-  points_in_boxes_batch_launcher(batch_size, boxes_num, pts_num, boxes, pts,
-                                 box_idx_of_points);
+  points_in_boxes_all_launcher(batch_size, boxes_num, pts_num, boxes, pts,
+                               box_idx_of_points);

  return 1;
 }
--- a/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp
+++ b/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp
@@ -40,16 +40,16 @@ int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
 int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor pts_indices_tensor);

-int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
-                        at::Tensor box_idx_of_points_tensor);
+int points_in_boxes_part(at::Tensor boxes_tensor, at::Tensor pts_tensor,
+                         at::Tensor box_idx_of_points_tensor);

-int points_in_boxes_batch(at::Tensor boxes_tensor, at::Tensor pts_tensor,
-                          at::Tensor box_idx_of_points_tensor);
+int points_in_boxes_all(at::Tensor boxes_tensor, at::Tensor pts_tensor,
+                        at::Tensor box_idx_of_points_tensor);

 int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
                        at::Tensor argmax, at::Tensor pts_idx_of_voxels,
                        at::Tensor pooled_features, int pool_method) {
-  // params rois: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coordinate
+  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, ry] in LiDAR coordinate
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_feature: (npoints, C)
  // params argmax: (N, out_x, out_y, out_z, C)
@@ -127,10 +127,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)");
  m.def("backward", &roiaware_pool3d_gpu_backward,
        "roiaware pool3d backward (CUDA)");
-  m.def("points_in_boxes_gpu", &points_in_boxes_gpu,
-        "points_in_boxes_gpu forward (CUDA)");
-  m.def("points_in_boxes_batch", &points_in_boxes_batch,
-        "points_in_boxes_batch forward (CUDA)");
+  m.def("points_in_boxes_part", &points_in_boxes_part,
+        "points_in_boxes_part forward (CUDA)");
+  m.def("points_in_boxes_all", &points_in_boxes_all,
+        "points_in_boxes_all forward (CUDA)");
  m.def("points_in_boxes_cpu", &points_in_boxes_cpu,
        "points_in_boxes_cpu forward (CPU)");
 }
--- a/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
+++ b/mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
@@ -17,9 +17,7 @@
 __device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
                                             float rz, float &local_x,
                                             float &local_y) {
-  // should rotate pi/2 + alpha to translate LiDAR to local
-  float rot_angle = rz + M_PI / 2;
-  float cosa = cos(rot_angle), sina = sin(rot_angle);
+  float cosa = cos(-rz), sina = sin(-rz);
  local_x = shift_x * cosa + shift_y * (-sina);
  local_y = shift_x * sina + shift_y * cosa;
 }
@@ -27,17 +25,17 @@ __device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
 __device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
                                        float &local_x, float &local_y) {
  // param pt: (x, y, z)
-  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
+  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, cz in the
  // bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
-  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
-  cz += h / 2.0;  // shift to the center since cz in box3d is the bottom center
+  float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
+  cz += z_size / 2.0;  // shift to the center since cz in box3d is the bottom center

-  if (fabsf(z - cz) > h / 2.0) return 0;
+  if (fabsf(z - cz) > z_size / 2.0) return 0;
  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
-  float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
-                  (local_y > -w / 2.0) & (local_y < w / 2.0);
+  float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
+                  (local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
  return in_flag;
 }

@@ -45,9 +43,9 @@ __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
                                            int out_x, int out_y, int out_z,
                                            const float *rois, const float *pts,
                                            int *pts_mask) {
-  // params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate
+  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate
  // params pts: (npoints, 3) [x, y, z]
-  // params pts_mask: (N, npoints): -1 means point doesnot in this box,
+  // params pts_mask: (N, npoints): -1 means point does not in this box,
  // otherwise: encode (x_idxs, y_idxs, z_idxs) by binary bit
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int box_idx = blockIdx.y;
@@ -63,14 +61,14 @@ __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
  pts_mask[0] = -1;
  if (cur_in_flag > 0) {
    float local_z = pts[2] - rois[2];
-    float w = rois[3], l = rois[4], h = rois[5];
+    float x_size = rois[3], y_size = rois[4], z_size = rois[5];

-    float x_res = l / out_x;
-    float y_res = w / out_y;
-    float z_res = h / out_z;
+    float x_res = x_size / out_x;
+    float y_res = y_size / out_y;
+    float z_res = z_size / out_z;

-    unsigned int x_idx = int((local_x + l / 2) / x_res);
-    unsigned int y_idx = int((local_y + w / 2) / y_res);
+    unsigned int x_idx = int((local_x + x_size / 2) / x_res);
+    unsigned int y_idx = int((local_y + y_size / 2) / y_res);
    unsigned int z_idx = int(local_z / z_res);

    x_idx = min(max(x_idx, 0), out_x - 1);
@@ -231,7 +229,7 @@ void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
                              const float *pts_feature, int *argmax,
                              int *pts_idx_of_voxels, float *pooled_features,
                              int pool_method) {
-  // params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate
+  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_feature: (npoints, C)
  // params argmax: (N, out_x, out_y, out_z, C)

--- a/mmdet3d/ops/roipoint_pool3d/__init__.py
+++ b/mmdet3d/ops/roipoint_pool3d/__init__.py
+# Copyright (c) OpenMMLab. All rights reserved.
+from .roipoint_pool3d import RoIPointPool3d
+
+__all__ = ['RoIPointPool3d']
--- a/mmdet3d/ops/roipoint_pool3d/roipoint_pool3d.py
+++ b/mmdet3d/ops/roipoint_pool3d/roipoint_pool3d.py
+# Copyright (c) OpenMMLab. All rights reserved.
+from torch import nn as nn
+from torch.autograd import Function
+
+from . import roipoint_pool3d_ext
+
+
+class RoIPointPool3d(nn.Module):
+
+    def __init__(self, num_sampled_points=512):
+        super().__init__()
+        """
+        Args:
+            num_sampled_points (int): Number of samples in each roi
+        """
+        self.num_sampled_points = num_sampled_points
+
+    def forward(self, points, point_features, boxes3d):
+        """
+        Args:
+            points (torch.Tensor): Input points whose shape is BxNx3
+            point_features: (B, N, C)
+            boxes3d: (B, M, 7), [x, y, z, dx, dy, dz, heading]
+
+        Returns:
+            torch.Tensor: (B, M, 512, 3 + C) pooled_features
+            torch.Tensor: (B, M) pooled_empty_flag
+        """
+        return RoIPointPool3dFunction.apply(points, point_features, boxes3d,
+                                            self.num_sampled_points)
+
+
+class RoIPointPool3dFunction(Function):
+
+    @staticmethod
+    def forward(ctx, points, point_features, boxes3d, num_sampled_points=512):
+        """
+        Args:
+            points (torch.Tensor): Input points whose shape is (B, N, 3)
+            point_features (torch.Tensor): Input points features shape is \
+                (B, N, C)
+            boxes3d (torch.Tensor): Input bounding boxes whose shape is \
+                (B, M, 7)
+            num_sampled_points (int): the num of sampled points
+
+        Returns:
+            torch.Tensor: (B, M, 512, 3 + C) pooled_features
+            torch.Tensor: (B, M) pooled_empty_flag
+        """
+        assert points.shape.__len__() == 3 and points.shape[2] == 3
+        batch_size, boxes_num, feature_len = points.shape[0], boxes3d.shape[
+            1], point_features.shape[2]
+        pooled_boxes3d = boxes3d.view(batch_size, -1, 7)
+        pooled_features = point_features.new_zeros(
+            (batch_size, boxes_num, num_sampled_points, 3 + feature_len))
+        pooled_empty_flag = point_features.new_zeros(
+            (batch_size, boxes_num)).int()
+
+        roipoint_pool3d_ext.forward(points.contiguous(),
+                                    pooled_boxes3d.contiguous(),
+                                    point_features.contiguous(),
+                                    pooled_features, pooled_empty_flag)
+
+        return pooled_features, pooled_empty_flag
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        raise NotImplementedError
+
+
+if __name__ == '__main__':
+    pass
--- a/mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d.cpp
+++ b/mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d.cpp
+/*
+Modified for
+https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
+Point cloud feature pooling
+Written by Shaoshuai Shi
+All Rights Reserved 2018.
+*/
+#include <torch/serialize/tensor.h>
+#include <torch/extension.h>
+
+#define CHECK_CUDA(x) do { \
+  if (!x.type().is_cuda()) { \
+    fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+    exit(-1); \
+  } \
+} while (0)
+#define CHECK_CONTIGUOUS(x) do { \
+  if (!x.is_contiguous()) { \
+    fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+    exit(-1); \
+  } \
+} while (0)
+#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+
+
+void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
+                       const float *xyz, const float *boxes3d, const float *pts_feature, float *pooled_features, int *pooled_empty_flag);
+
+
+int roipool3d_gpu(at::Tensor xyz, at::Tensor boxes3d, at::Tensor pts_feature, at::Tensor pooled_features, at::Tensor pooled_empty_flag){
+    // params xyz: (B, N, 3)
+    // params boxes3d: (B, M, 7)
+    // params pts_feature: (B, N, C)
+    // params pooled_features: (B, M, 512, 3+C)
+    // params pooled_empty_flag: (B, M)
+    CHECK_INPUT(xyz);
+    CHECK_INPUT(boxes3d);
+    CHECK_INPUT(pts_feature);
+    CHECK_INPUT(pooled_features);
+    CHECK_INPUT(pooled_empty_flag);
+
+    int batch_size = xyz.size(0);
+    int pts_num = xyz.size(1);
+    int boxes_num = boxes3d.size(1);
+    int feature_in_len = pts_feature.size(2);
+    int sampled_pts_num = pooled_features.size(2);
+
+
+    const float * xyz_data = xyz.data<float>();
+    const float * boxes3d_data = boxes3d.data<float>();
+    const float * pts_feature_data = pts_feature.data<float>();
+    float * pooled_features_data = pooled_features.data<float>();
+    int * pooled_empty_flag_data = pooled_empty_flag.data<int>();
+
+    roipool3dLauncher(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
+                       xyz_data, boxes3d_data, pts_feature_data, pooled_features_data, pooled_empty_flag_data);
+
+
+
+    return 1;
+}
+
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("forward", &roipool3d_gpu, "roipool3d forward (CUDA)");
+}
--- a/mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
+++ b/mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
+/*
+Modified from
+https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
+Point cloud feature pooling
+Written by Shaoshuai Shi
+All Rights Reserved 2018.
+*/
+
+#include <math.h>
+#include <stdio.h>
+
+#define THREADS_PER_BLOCK 256
+#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+// #define DEBUG
+
+__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
+                                             float rz, float &local_x,
+                                             float &local_y) {
+  float cosa = cos(-rz), sina = sin(-rz);
+  local_x = shift_x * cosa + shift_y * (-sina);
+  local_y = shift_x * sina + shift_y * cosa;
+}
+
+__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
+                                        float &local_x, float &local_y) {
+  // param pt: (x, y, z)
+  // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the
+  // bottom center
+  float x = pt[0], y = pt[1], z = pt[2];
+  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
+  float dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6];
+  cz += dz / 2.0;  // shift to the center since cz in box3d is the bottom center
+
+  if (fabsf(z - cz) > dz / 2.0) return 0;
+  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
+  float in_flag = (local_x > -dx / 2.0) & (local_x < dx / 2.0) &
+                  (local_y > -dy / 2.0) & (local_y < dy / 2.0);
+  return in_flag;
+}
+
+__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, const float *xyz, const float *boxes3d, int *pts_assign){
+    // params xyz: (B, N, 3)
+    // params boxes3d: (B, M, 7)
+    // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means background points
+    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int box_idx = blockIdx.y;
+    int bs_idx = blockIdx.z;
+
+    if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size){
+        return;
+    }
+    int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
+    pts_assign[assign_idx] = 0;
+
+    int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
+    int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
+
+
+    float local_x = 0, local_y = 0;
+    int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, local_x, local_y);
+    pts_assign[assign_idx] = cur_in_flag;
+    // printf("bs=%d, pt=%d, in=%d\n", bs_idx, pt_idx, pts_assign[bs_idx * pts_num + pt_idx]);
+}
+
+
+__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, int sampled_pts_num,
+                               const int *pts_assign, int *pts_idx, int *pooled_empty_flag){
+    // params xyz: (B, N, 3)
+    // params pts_feature: (B, N, C)
+    // params pts_assign: (B, N)
+    // params pts_idx: (B, M, 512)
+    // params pooled_empty_flag: (B, M)
+
+    int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (boxes_idx >= boxes_num){
+        return;
+    }
+
+    int bs_idx = blockIdx.y;
+
+    int cnt = 0;
+    for (int k = 0; k < pts_num; k++){
+        if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]){
+            if (cnt < sampled_pts_num){
+                pts_idx[bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num + cnt] = k;
+                cnt++;
+            }
+            else break;
+        }
+    }
+
+    if (cnt == 0){
+        pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
+    }
+    else if (cnt < sampled_pts_num){
+        // duplicate same points for sampling
+        for (int k = cnt; k < sampled_pts_num; k++){
+            int duplicate_idx = k % cnt;
+            int base_offset = bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
+            pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
+        }
+    }
+}
+
+
+__global__ void roipool3d_forward(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
+                                   const float *xyz, const int *pts_idx, const float *pts_feature,
+                                   float *pooled_features, int *pooled_empty_flag){
+    // params xyz: (B, N, 3)
+    // params pts_idx: (B, M, 512)
+    // params pts_feature: (B, N, C)
+    // params pooled_features: (B, M, 512, 3+C)
+    // params pooled_empty_flag: (B, M)
+
+    int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int box_idx = blockIdx.y;
+    int bs_idx = blockIdx.z;
+
+    if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num || bs_idx >= batch_size){
+        return;
+    }
+
+    if (pooled_empty_flag[bs_idx * boxes_num + box_idx]){
+        return;
+    }
+
+    int temp_idx = bs_idx * boxes_num * sampled_pts_num + box_idx * sampled_pts_num + sample_pt_idx;
+    int src_pt_idx = pts_idx[temp_idx];
+    int dst_feature_offset = temp_idx * (3 + feature_in_len);
+
+    for (int j = 0; j < 3; j++)
+        pooled_features[dst_feature_offset + j] = xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
+
+    int src_feature_offset = bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
+    for (int j = 0; j < feature_in_len; j++)
+        pooled_features[dst_feature_offset + 3 + j] = pts_feature[src_feature_offset + j];
+}
+
+
+void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num,
+                       const float *xyz, const float *boxes3d, const float *pts_feature, float *pooled_features, int *pooled_empty_flag){
+
+    // printf("batch_size=%d, pts_num=%d, boxes_num=%d\n", batch_size, pts_num, boxes_num);
+    int *pts_assign = NULL;
+    cudaMalloc(&pts_assign, batch_size * pts_num * boxes_num * sizeof(int));  // (batch_size, N, M)
+    // cudaMemset(&pts_assign, -1, batch_size * pts_num * boxes_num * sizeof(int));
+
+    dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);  // blockIdx.x(col), blockIdx.y(row)
+    dim3 threads(THREADS_PER_BLOCK);
+    assign_pts_to_box3d<<<blocks, threads>>>(batch_size, pts_num, boxes_num, xyz, boxes3d, pts_assign);
+
+    int *pts_idx = NULL;
+    cudaMalloc(&pts_idx, batch_size * boxes_num * sampled_pts_num * sizeof(int));  // (batch_size, M, sampled_pts_num)
+
+    dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK), batch_size);  // blockIdx.x(col), blockIdx.y(row)
+    get_pooled_idx<<<blocks2, threads>>>(batch_size, pts_num, boxes_num, sampled_pts_num, pts_assign, pts_idx, pooled_empty_flag);
+
+    dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
+    roipool3d_forward<<<blocks_pool, threads>>>(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
+                                                      xyz, pts_idx, pts_feature, pooled_features, pooled_empty_flag);
+
+    cudaFree(pts_assign);
+    cudaFree(pts_idx);
+
+#ifdef DEBUG
+    cudaDeviceSynchronize();  // for using printf in kernel function
+#endif
+}
--- a/mmdet3d/ops/sparse_block.py
+++ b/mmdet3d/ops/sparse_block.py
@@ -14,12 +14,12 @@ class SparseBottleneck(Bottleneck, spconv.SparseModule):
    Args:
        inplanes (int): inplanes of block.
        planes (int): planes of block.
-        stride (int): stride of the first block. Default: 1
-        downsample (None | Module): down sample module for block.
-        conv_cfg (dict): dictionary to construct and config conv layer.
-            Default: None
-        norm_cfg (dict): dictionary to construct and config norm layer.
-            Default: dict(type='BN')
+        stride (int, optional): stride of the first block. Default: 1.
+        downsample (Module, optional): down sample module for block.
+        conv_cfg (dict, optional): dictionary to construct and config conv
+            layer. Default: None.
+        norm_cfg (dict, optional): dictionary to construct and config norm
+            layer. Default: dict(type='BN').
    """

    expansion = 4
@@ -73,12 +73,12 @@ class SparseBasicBlock(BasicBlock, spconv.SparseModule):
    Args:
        inplanes (int): inplanes of block.
        planes (int): planes of block.
-        stride (int): stride of the first block. Default: 1
-        downsample (None | Module): down sample module for block.
-        conv_cfg (dict): dictionary to construct and config conv layer.
-            Default: None
-        norm_cfg (dict): dictionary to construct and config norm layer.
-            Default: dict(type='BN')
+        stride (int, optional): stride of the first block. Default: 1.
+        downsample (Module, optional): down sample module for block.
+        conv_cfg (dict, optional): dictionary to construct and config conv
+            layer. Default: None.
+        norm_cfg (dict, optional): dictionary to construct and config norm
+            layer. Default: dict(type='BN').
    """

    expansion = 1