Commit bce7d0c3 authored by yinchimaoliang's avatar yinchimaoliang
Browse files

Merge branch 'master_temp' into indoor_pipeline

parents 1756485e 868c5fab
......@@ -93,7 +93,8 @@ class_names = ['Car']
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict(
use_lidar=True,
use_lidar=False,
use_lidar_reduced=True,
use_depth=False,
use_lidar_intensity=True,
use_camera=False,
......
......@@ -113,7 +113,8 @@ class_names = ['Pedestrian', 'Cyclist', 'Car']
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict(
use_lidar=True,
use_lidar=False,
use_lidar_reduced=True,
use_depth=False,
use_lidar_intensity=True,
use_camera=True,
......
......@@ -91,7 +91,8 @@ class_names = ['Car']
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict(
use_lidar=True,
use_lidar=False,
use_lidar_reduced=True,
use_depth=False,
use_lidar_intensity=True,
use_camera=True,
......
......@@ -90,7 +90,8 @@ class_names = ['Car']
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict(
use_lidar=True,
use_lidar=False,
use_lidar_reduced=True,
use_depth=False,
use_lidar_intensity=True,
use_camera=False,
......
......@@ -89,7 +89,8 @@ class_names = ['Car']
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict(
use_lidar=True,
use_lidar=False,
use_lidar_reduced=True,
use_depth=False,
use_lidar_intensity=True,
use_camera=False,
......
import torch
from mmdet3d.ops.iou3d import boxes_iou3d_gpu
from mmdet3d.ops.iou3d import boxes_iou3d_gpu_camera, boxes_iou3d_gpu_lidar
from mmdet.core.bbox import bbox_overlaps
from mmdet.core.bbox.iou_calculators.builder import IOU_CALCULATORS
from .. import box_torch_ops
......@@ -22,10 +22,18 @@ class BboxOverlapsNearest3D(object):
@IOU_CALCULATORS.register_module()
class BboxOverlaps3D(object):
"""3D IoU Calculator"""
"""3D IoU Calculator
def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
return bbox_overlaps_3d(bboxes1, bboxes2, mode, is_aligned)
Args:
coordinate (str): 'camera' or 'lidar' coordinate system
"""
def __init__(self, coordinate):
assert coordinate in ['camera', 'lidar']
self.coordinate = coordinate
def __call__(self, bboxes1, bboxes2, mode='iou'):
return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate)
def __repr__(self):
repr_str = self.__class__.__name__
......@@ -62,7 +70,7 @@ def bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode='iou', is_aligned=False):
return ret
def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou'):
def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'):
"""Calculate 3D IoU using cuda implementation
Args:
......@@ -70,6 +78,7 @@ def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou'):
bboxes2: Tensor, shape (M, 7) [x, y, z, h, w, l, ry]
mode: mode (str): "iou" (intersection over union) or
iof (intersection over foreground).
coordinate (str): 'camera' or 'lidar' coordinate system
Return:
iou: (M, N) not support aligned mode currently
......@@ -77,4 +86,11 @@ def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou'):
# TODO: check the input dimension meanings,
# this is inconsistent with that in bbox_overlaps_nearest_3d
assert bboxes1.size(-1) == bboxes2.size(-1) == 7
return boxes_iou3d_gpu(bboxes1, bboxes2, mode)
assert coordinate in ['camera', 'lidar']
if coordinate == 'camera':
return boxes_iou3d_gpu_camera(bboxes1, bboxes2, mode)
elif coordinate == 'lidar':
return boxes_iou3d_gpu_lidar(bboxes1, bboxes2, mode)
else:
raise NotImplementedError
......@@ -3,9 +3,10 @@ from mmdet.core.bbox.samplers import (BaseSampler, CombinedSampler,
IoUBalancedNegSampler, OHEMSampler,
PseudoSampler, RandomSampler,
SamplingResult)
from .iou_neg_piecewise_sampler import IoUNegPiecewiseSampler
__all__ = [
'BaseSampler', 'PseudoSampler', 'RandomSampler',
'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler',
'OHEMSampler', 'SamplingResult'
'OHEMSampler', 'SamplingResult', 'IoUNegPiecewiseSampler'
]
import torch
from mmdet.core.bbox.builder import BBOX_SAMPLERS
from . import RandomSampler, SamplingResult
@BBOX_SAMPLERS.register_module
class IoUNegPiecewiseSampler(RandomSampler):
    """IoU piece-wise sampling.

    Sampling negative proposals according to a list of IoU thresholds.
    The negative proposals are divided into several pieces according
    to `neg_iou_piece_thrs`. And the ratio of each piece is indicated
    by `neg_piece_fractions`.

    Args:
        num (int): Number of proposals.
        pos_fraction (float): The fraction of positive proposals.
        neg_piece_fractions (list): A list containing fractions that
            indicate the ratio of each piece of total negative samplers.
        neg_iou_piece_thrs (list): A list containing IoU thresholds that
            indicate the upper bound of this piece.
        neg_pos_ub (float): The total ratio to limit the upper bound
            number of negative samples.
        add_gt_as_proposals (bool): Whether to add gt as proposals.
        return_iou (bool): Whether to attach the IoU of the sampled
            proposals to the sampling result.
    """

    def __init__(self,
                 num,
                 pos_fraction=None,
                 neg_piece_fractions=None,
                 neg_iou_piece_thrs=None,
                 neg_pos_ub=-1,
                 add_gt_as_proposals=False,
                 return_iou=False):
        super(IoUNegPiecewiseSampler,
              self).__init__(num, pos_fraction, neg_pos_ub,
                             add_gt_as_proposals)
        assert isinstance(neg_piece_fractions, list)
        assert len(neg_piece_fractions) == len(neg_iou_piece_thrs)
        self.neg_piece_fractions = neg_piece_fractions
        self.neg_iou_thr = neg_iou_piece_thrs
        self.return_iou = return_iou
        self.neg_piece_num = len(self.neg_piece_fractions)

    def _sample_pos(self, assign_result, num_expected, **kwargs):
        """Randomly sample some positive samples.

        Args:
            assign_result (:obj:`AssignResult`): Bbox assigning results.
            num_expected (int): The expected number of positive samples.

        Returns:
            Tensor: Indices of the sampled positive proposals.
        """
        pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False)
        if pos_inds.numel() != 0:
            pos_inds = pos_inds.squeeze(1)
        if pos_inds.numel() <= num_expected:
            return pos_inds
        else:
            return self.random_choice(pos_inds, num_expected)

    def _sample_neg(self, assign_result, num_expected, **kwargs):
        """Sample negative proposals piece-wise according to their max IoU.

        Args:
            assign_result (:obj:`AssignResult`): Bbox assigning results.
            num_expected (int): The expected number of negative samples.

        Returns:
            Tensor: Indices of the sampled negative proposals.
        """
        # `as_tuple=False` for consistency with `_sample_pos` and to avoid
        # the deprecation warning of the no-argument form.
        neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False)
        if neg_inds.numel() != 0:
            neg_inds = neg_inds.squeeze(1)
        if len(neg_inds) <= num_expected:
            return neg_inds
        else:
            neg_inds_choice = neg_inds.new_zeros([0])
            extend_num = 0
            max_overlaps = assign_result.max_overlaps[neg_inds]

            for piece_inds in range(self.neg_piece_num):
                if piece_inds == self.neg_piece_num - 1:  # for the last piece
                    piece_expected_num = num_expected - len(neg_inds_choice)
                    min_iou_thr = 0
                else:
                    # if the numbers of negative samplers in previous
                    # pieces are less than the expected number, extend
                    # the same number in the current piece.
                    piece_expected_num = int(
                        num_expected *
                        self.neg_piece_fractions[piece_inds]) + extend_num
                    min_iou_thr = self.neg_iou_thr[piece_inds + 1]
                max_iou_thr = self.neg_iou_thr[piece_inds]
                # negatives whose max IoU falls in [min_iou_thr, max_iou_thr)
                piece_neg_inds = torch.nonzero(
                    (max_overlaps >= min_iou_thr)
                    & (max_overlaps < max_iou_thr),
                    as_tuple=False).view(-1)

                if len(piece_neg_inds) < piece_expected_num:
                    # not enough candidates in this piece: take them all and
                    # carry the deficit over to the next piece
                    neg_inds_choice = torch.cat(
                        [neg_inds_choice, neg_inds[piece_neg_inds]], dim=0)
                    extend_num += piece_expected_num - len(piece_neg_inds)
                else:
                    piece_choice = self.random_choice(piece_neg_inds,
                                                      piece_expected_num)
                    neg_inds_choice = torch.cat(
                        [neg_inds_choice, neg_inds[piece_choice]], dim=0)
                    extend_num = 0
            return neg_inds_choice

    def sample(self,
               assign_result,
               bboxes,
               gt_bboxes,
               gt_labels=None,
               **kwargs):
        """Sample positive and negative bboxes.

        This is a simple implementation of bbox sampling given candidates,
        assigning results and ground truth bboxes.

        Args:
            assign_result (:obj:`AssignResult`): Bbox assigning results.
            bboxes (Tensor): Boxes to be sampled from.
            gt_bboxes (Tensor): Ground truth bboxes.
            gt_labels (Tensor, optional): Class labels of ground truth bboxes.

        Returns:
            :obj:`SamplingResult`: Sampling result.
        """
        if len(bboxes.shape) < 2:
            bboxes = bboxes[None, :]

        gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.bool)
        if self.add_gt_as_proposals and len(gt_bboxes) > 0:
            if gt_labels is None:
                raise ValueError(
                    'gt_labels must be given when add_gt_as_proposals is True')
            bboxes = torch.cat([gt_bboxes, bboxes], dim=0)
            assign_result.add_gt_(gt_labels)
            gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.bool)
            gt_flags = torch.cat([gt_ones, gt_flags])

        num_expected_pos = int(self.num * self.pos_fraction)
        pos_inds = self.pos_sampler._sample_pos(
            assign_result, num_expected_pos, bboxes=bboxes, **kwargs)
        # We found that sampled indices have duplicated items occasionally.
        # (may be a bug of PyTorch)
        pos_inds = pos_inds.unique()
        num_sampled_pos = pos_inds.numel()
        num_expected_neg = self.num - num_sampled_pos
        if self.neg_pos_ub >= 0:
            _pos = max(1, num_sampled_pos)
            neg_upper_bound = int(self.neg_pos_ub * _pos)
            if num_expected_neg > neg_upper_bound:
                num_expected_neg = neg_upper_bound
        neg_inds = self.neg_sampler._sample_neg(
            assign_result, num_expected_neg, bboxes=bboxes, **kwargs)
        neg_inds = neg_inds.unique()

        sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
                                         assign_result, gt_flags)
        if self.return_iou:
            # PartA2 needs iou score to regression.
            sampling_result.iou = assign_result.max_overlaps[torch.cat(
                [pos_inds, neg_inds])]
            sampling_result.iou.detach_()
        return sampling_result
......@@ -184,6 +184,8 @@ class KittiDataset(torch_data.Dataset):
if self.modality['use_depth'] and self.modality['use_lidar']:
points = self.get_lidar_depth_reduced(sample_idx)
elif self.modality['use_lidar']:
points = self.get_lidar(sample_idx)
elif self.modality['use_lidar_reduced']:
points = self.get_lidar_reduced(sample_idx)
elif self.modality['use_depth']:
points = self.get_pure_depth_reduced(sample_idx)
......@@ -238,8 +240,6 @@ class KittiDataset(torch_data.Dataset):
axis=1).astype(np.float32)
difficulty = annos['difficulty']
# this change gt_bboxes_3d to velodyne coordinates
import pdb
pdb.set_trace()
gt_bboxes_3d = box_np_ops.box_camera_to_lidar(gt_bboxes_3d, rect,
Trv2c)
# only center format is allowed. so we need to convert
......
import warnings
import numba
import numpy as np
from numba.errors import NumbaPerformanceWarning
from mmdet3d.core.bbox import box_np_ops
warnings.filterwarnings("ignore", category=NumbaPerformanceWarning)
@numba.njit
def _rotation_box2d_jit_(corners, angle, rot_mat_T):
......
from .iou3d_utils import (boxes_iou3d_gpu, boxes_iou_bev, nms_gpu,
nms_normal_gpu)
from .iou3d_utils import (boxes_iou3d_gpu_camera, boxes_iou3d_gpu_lidar,
boxes_iou_bev, nms_gpu, nms_normal_gpu)
__all__ = ['boxes_iou_bev', 'boxes_iou3d_gpu', 'nms_gpu', 'nms_normal_gpu']
__all__ = [
'boxes_iou_bev', 'boxes_iou3d_gpu_camera', 'nms_gpu', 'nms_normal_gpu',
'boxes_iou3d_gpu_lidar'
]
......@@ -20,17 +20,22 @@ def boxes_iou_bev(boxes_a, boxes_b):
return ans_iou
def boxes_iou3d_gpu(boxes_a, boxes_b, mode='iou'):
"""
:param boxes_a: (N, 7) [x, y, z, h, w, l, ry]
:param boxes_b: (M, 7) [x, y, z, h, w, l, ry]
:param mode "iou" (intersection over union) or iof (intersection over
def boxes_iou3d_gpu_camera(boxes_a, boxes_b, mode='iou'):
"""Calculate 3d iou of boxes in camera coordinate
Args:
boxes_a (FloatTensor): (N, 7) [x, y, z, h, w, l, ry]
in camera coordinate
boxes_b (FloatTensor): (M, 7) [x, y, z, h, w, l, ry]
mode (str): "iou" (intersection over union) or iof (intersection over
foreground).
:return:
ans_iou: (M, N)
Returns:
FloatTensor: (M, N)
"""
boxes_a_bev = boxes3d_to_bev_torch(boxes_a)
boxes_b_bev = boxes3d_to_bev_torch(boxes_b)
boxes_a_bev = boxes3d_to_bev_torch_camera(boxes_a)
boxes_b_bev = boxes3d_to_bev_torch_camera(boxes_b)
# bev overlap
overlaps_bev = torch.cuda.FloatTensor(
......@@ -51,15 +56,62 @@ def boxes_iou3d_gpu(boxes_a, boxes_b, mode='iou'):
# 3d iou
overlaps_3d = overlaps_bev * overlaps_h
vol_a = (boxes_a[:, 3] * boxes_a[:, 4] * boxes_a[:, 5]).view(-1, 1)
vol_b = (boxes_b[:, 3] * boxes_b[:, 4] * boxes_b[:, 5]).view(1, -1)
volume_a = (boxes_a[:, 3] * boxes_a[:, 4] * boxes_a[:, 5]).view(-1, 1)
volume_b = (boxes_b[:, 3] * boxes_b[:, 4] * boxes_b[:, 5]).view(1, -1)
if mode == 'iou':
# the clamp func is used to avoid division of 0
iou3d = overlaps_3d / torch.clamp(
vol_a + vol_b - overlaps_3d, min=1e-8)
volume_a + volume_b - overlaps_3d, min=1e-8)
else:
iou3d = overlaps_3d / torch.clamp(vol_a, min=1e-8)
iou3d = overlaps_3d / torch.clamp(volume_a, min=1e-8)
return iou3d
def boxes_iou3d_gpu_lidar(boxes_a, boxes_b, mode='iou'):
    """Calculate 3d IoU of boxes in LiDAR coordinate.

    Args:
        boxes_a (FloatTensor): (N, 7) [x, y, z, w, l, h, ry]
            in LiDAR coordinate
        boxes_b (FloatTensor): (M, 7) [x, y, z, w, l, h, ry]
        mode (str): "iou" (intersection over union) or iof (intersection over
            foreground).

    Returns:
        FloatTensor: (N, M), pairwise 3d IoU (or IoF) between the two sets.
    """
    boxes_a_bev = boxes3d_to_bev_torch_lidar(boxes_a)
    boxes_b_bev = boxes3d_to_bev_torch_lidar(boxes_b)

    # height overlap: vertical extent of each box is [z, z + h]
    # (z at column 2 is the box bottom, h at column 5)
    boxes_a_height_max = (boxes_a[:, 2] + boxes_a[:, 5]).view(-1, 1)
    boxes_a_height_min = boxes_a[:, 2].view(-1, 1)
    boxes_b_height_max = (boxes_b[:, 2] + boxes_b[:, 5]).view(1, -1)
    boxes_b_height_min = boxes_b[:, 2].view(1, -1)

    # bev overlap, computed by the custom CUDA op
    overlaps_bev = boxes_a.new_zeros(
        torch.Size((boxes_a.shape[0], boxes_b.shape[0])))  # (N, M)
    iou3d_cuda.boxes_overlap_bev_gpu(boxes_a_bev.contiguous(),
                                     boxes_b_bev.contiguous(), overlaps_bev)
    max_of_min = torch.max(boxes_a_height_min, boxes_b_height_min)
    min_of_max = torch.min(boxes_a_height_max, boxes_b_height_max)
    overlaps_h = torch.clamp(min_of_max - max_of_min, min=0)

    # 3d overlap = bev overlap area * height overlap
    overlaps_3d = overlaps_bev * overlaps_h

    volume_a = (boxes_a[:, 3] * boxes_a[:, 4] * boxes_a[:, 5]).view(-1, 1)
    volume_b = (boxes_b[:, 3] * boxes_b[:, 4] * boxes_b[:, 5]).view(1, -1)

    if mode == 'iou':
        # the clamp func is used to avoid division of 0
        iou3d = overlaps_3d / torch.clamp(
            volume_a + volume_b - overlaps_3d, min=1e-8)
    else:
        iou3d = overlaps_3d / torch.clamp(volume_a, min=1e-8)

    return iou3d
......@@ -98,16 +150,39 @@ def nms_normal_gpu(boxes, scores, thresh):
return order[keep[:num_out].cuda()].contiguous()
def boxes3d_to_bev_torch(boxes3d):
"""
:param boxes3d: (N, 7) [x, y, z, h, w, l, ry] in camera coords
:return:
boxes_bev: (N, 5) [x1, y1, x2, y2, ry]
def boxes3d_to_bev_torch_camera(boxes3d):
    """Convert 3D boxes to BEV boxes in camera coordinates.

    Args:
        boxes3d (FloatTensor): (N, 7) [x, y, z, h, w, l, ry] in camera coords

    Returns:
        FloatTensor: (N, 5) [x1, y1, x2, y2, ry]
    """
    # new_zeros instead of the legacy uninitialized `Tensor.new(Size)`;
    # every element is assigned below so values are identical.
    boxes_bev = boxes3d.new_zeros((boxes3d.shape[0], 5))
    # In camera coordinates the BEV plane is spanned by x (col 0) and z
    # (col 2); l (col 5) extends along x and w (col 4) along z.
    cu, cv = boxes3d[:, 0], boxes3d[:, 2]
    half_l, half_w = boxes3d[:, 5] / 2, boxes3d[:, 4] / 2
    boxes_bev[:, 0], boxes_bev[:, 1] = cu - half_l, cv - half_w
    boxes_bev[:, 2], boxes_bev[:, 3] = cu + half_l, cv + half_w
    boxes_bev[:, 4] = boxes3d[:, 6]
    return boxes_bev
def boxes3d_to_bev_torch_lidar(boxes3d):
    """Convert 3D boxes to BEV boxes in LiDAR coordinates.

    Args:
        boxes3d (FloatTensor): (N, 7) [x, y, z, w, l, h, ry] in LiDAR coords

    Returns:
        FloatTensor: (N, 5) [x1, y1, x2, y2, ry]
    """
    # new_zeros instead of the legacy uninitialized `Tensor.new(Size)`;
    # every element is assigned below so values are identical.
    boxes_bev = boxes3d.new_zeros((boxes3d.shape[0], 5))
    # In LiDAR coordinates the BEV plane is spanned by x (col 0) and y
    # (col 1); w (col 3) extends along x and l (col 4) along y.
    x, y = boxes3d[:, 0], boxes3d[:, 1]
    half_l, half_w = boxes3d[:, 4] / 2, boxes3d[:, 3] / 2
    boxes_bev[:, 0], boxes_bev[:, 1] = x - half_w, y - half_l
    boxes_bev[:, 2], boxes_bev[:, 3] = x + half_w, y + half_l
    boxes_bev[:, 4] = boxes3d[:, 6]
    return boxes_bev
......@@ -7,8 +7,8 @@
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <torch/types.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
......
//Modified from
//https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
//RoI-aware point cloud feature pooling
//Written by Shaoshuai Shi
//All Rights Reserved 2019.
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// RoI-aware point cloud feature pooling
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/serialize/tensor.h>
#include <torch/types.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// #define DEBUG
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float rz, float &local_x, float &local_y){
// should rotate pi/2 + alpha to translate LiDAR to local
float rot_angle = rz + M_PI / 2;
float cosa = cos(rot_angle), sina = sin(rot_angle);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
// Rotate the offset (shift_x, shift_y) from the LiDAR frame into the local
// frame of a box with yaw `rz`; results are written into local_x / local_y.
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
                                             float rz, float &local_x,
                                             float &local_y) {
  // should rotate pi/2 + alpha to translate LiDAR to local
  float rot_angle = rz + M_PI / 2;
  float cosa = cos(rot_angle), sina = sin(rot_angle);
  local_x = shift_x * cosa + shift_y * (-sina);
  local_y = shift_x * sina + shift_y * cosa;
}
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, float &local_x, float &local_y){
// param pt: (x, y, z)
// param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
cz += h / 2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > h / 2.0) return 0;
lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) & (local_y > -w / 2.0) & (local_y < w / 2.0);
return in_flag;
// Return 1 if point `pt` lies inside the 3D box `box3d`, else 0. On any call
// the point's box-local planar coordinates are written to local_x / local_y.
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
                                        float &local_x, float &local_y) {
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
  // bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
  cz += h / 2.0;  // shift to the center since cz in box3d is the bottom center
  // early out on the vertical extent before paying for the rotation
  if (fabsf(z - cz) > h / 2.0) return 0;
  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
  // NOTE: bitwise & of 0/1 comparison results — equivalent to && here,
  // stored into a float and implicitly converted back to int on return.
  float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
                  (local_y > -w / 2.0) & (local_y < w / 2.0);
  return in_flag;
}
__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, int out_x, int out_y, int out_z,
const float *rois, const float *pts, int *pts_mask){
// params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z]
// params pts_mask: (N, npoints): -1 means point doesnot in this box, otherwise: encode (x_idxs, y_idxs, z_idxs) by binary bit
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y;
if (pt_idx >= pts_num || box_idx >= boxes_num) return;
pts += pt_idx * 3;
rois += box_idx * 7;
pts_mask += box_idx * pts_num + pt_idx;
float local_x = 0, local_y = 0;
int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
pts_mask[0] = -1;
if (cur_in_flag > 0){
float local_z = pts[2] - rois[2];
float w = rois[3], l = rois[4], h = rois[5];
float x_res = l / out_x;
float y_res = w / out_y;
float z_res = h / out_z;
unsigned int x_idx = int((local_x + l / 2) / x_res);
unsigned int y_idx = int((local_y + w / 2) / y_res);
unsigned int z_idx = int(local_z / z_res);
x_idx = min(max(x_idx, 0), out_x - 1);
y_idx = min(max(y_idx, 0), out_y - 1);
z_idx = min(max(z_idx, 0), out_z - 1);
unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
int out_x, int out_y, int out_z,
const float *rois, const float *pts,
int *pts_mask) {
// params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z]
// params pts_mask: (N, npoints): -1 means point doesnot in this box,
// otherwise: encode (x_idxs, y_idxs, z_idxs) by binary bit
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
int box_idx = blockIdx.y;
if (pt_idx >= pts_num || box_idx >= boxes_num) return;
pts += pt_idx * 3;
rois += box_idx * 7;
pts_mask += box_idx * pts_num + pt_idx;
float local_x = 0, local_y = 0;
int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
pts_mask[0] = -1;
if (cur_in_flag > 0) {
float local_z = pts[2] - rois[2];
float w = rois[3], l = rois[4], h = rois[5];
float x_res = l / out_x;
float y_res = w / out_y;
float z_res = h / out_z;
unsigned int x_idx = int((local_x + l / 2) / x_res);
unsigned int y_idx = int((local_y + w / 2) / y_res);
unsigned int z_idx = int(local_z / z_res);
x_idx = min(max(x_idx, 0), out_x - 1);
y_idx = min(max(y_idx, 0), out_y - 1);
z_idx = min(max(z_idx, 0), out_z - 1);
unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
#ifdef DEBUG
printf("mask: pts_%d(%.3f, %.3f, %.3f), local(%.3f, %.3f, %.3f), idx(%d, %d, %d), res(%.3f, %.3f, %.3f), idx_encoding=%x\n",
pt_idx, pts[0], pts[1], pts[2], local_x, local_y, local_z, x_idx, y_idx, z_idx, x_res, y_res, z_res, idx_encoding);
printf(
"mask: pts_%d(%.3f, %.3f, %.3f), local(%.3f, %.3f, %.3f), idx(%d, %d, "
"%d), res(%.3f, %.3f, %.3f), idx_encoding=%x\n",
pt_idx, pts[0], pts[1], pts[2], local_x, local_y, local_z, x_idx, y_idx,
z_idx, x_res, y_res, z_res, idx_encoding);
#endif
pts_mask[0] = idx_encoding;
}
pts_mask[0] = idx_encoding;
}
}
__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, int max_pts_each_voxel,
int out_x, int out_y, int out_z, const int *pts_mask, int *pts_idx_of_voxels){
// params pts_mask: (N, npoints) 0 or 1
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
int box_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (box_idx >= boxes_num) return;
int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
for (int k = 0; k < pts_num; k++){
if (pts_mask[box_idx * pts_num + k] != -1){
unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
unsigned int z_idx = idx_encoding & 0xFF;
unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + y_idx * out_z * max_pts_each_voxel + z_idx * max_pts_each_voxel;
unsigned int cnt = pts_idx_of_voxels[base_offset];
if (cnt < max_num_pts){
pts_idx_of_voxels[base_offset + cnt + 1] = k;
pts_idx_of_voxels[base_offset]++;
}
__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,
int max_pts_each_voxel, int out_x,
int out_y, int out_z,
const int *pts_mask,
int *pts_idx_of_voxels) {
// params pts_mask: (N, npoints) 0 or 1
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
int box_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (box_idx >= boxes_num) return;
int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
for (int k = 0; k < pts_num; k++) {
if (pts_mask[box_idx * pts_num + k] != -1) {
unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
unsigned int z_idx = idx_encoding & 0xFF;
unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
y_idx * out_z * max_pts_each_voxel +
z_idx * max_pts_each_voxel;
unsigned int cnt = pts_idx_of_voxels[base_offset];
if (cnt < max_num_pts) {
pts_idx_of_voxels[base_offset + cnt + 1] = k;
pts_idx_of_voxels[base_offset]++;
}
#ifdef DEBUG
printf("collect: pts_%d, idx(%d, %d, %d), idx_encoding=%x\n",
k, x_idx, y_idx, z_idx, idx_encoding);
printf("collect: pts_%d, idx(%d, %d, %d), idx_encoding=%x\n", k, x_idx,
y_idx, z_idx, idx_encoding);
#endif
}
}
}
}
__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
int out_y, int out_z, const float *pts_feature, const int *pts_idx_of_voxels, float *pooled_features, int *argmax){
// params pts_feature: (npoints, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), index 0 is the counter
// params pooled_features: (N, out_x, out_y, out_z, C)
// params argmax: (N, out_x, out_y, out_z, C)
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return;
__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const float *pts_feature,
const int *pts_idx_of_voxels,
float *pooled_features, int *argmax) {
// params pts_feature: (npoints, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
// index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)
// params argmax: (N, out_x, out_y, out_z, C)
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
y_idx >= out_y || z_idx >= out_z)
return;
#ifdef DEBUG
printf("src pts_idx_of_voxels: (%p, ), argmax: %p\n", pts_idx_of_voxels, argmax);
printf("src pts_idx_of_voxels: (%p, ), argmax: %p\n", pts_idx_of_voxels,
argmax);
#endif
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel;
pooled_features += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
offset_base * max_pts_each_voxel;
pooled_features += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
argmax += box_idx * out_x * out_y * out_z * channels +
offset_base * channels + channel_idx;
int argmax_idx = -1;
float max_val = -1e50;
int argmax_idx = -1;
float max_val = -1e50;
int total_pts = pts_idx_of_voxels[0];
int total_pts = pts_idx_of_voxels[0];
for (int k = 1; k <= total_pts; k++){
if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val){
max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
argmax_idx = pts_idx_of_voxels[k];
}
for (int k = 1; k <= total_pts; k++) {
if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) {
max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
argmax_idx = pts_idx_of_voxels[k];
}
}
if (argmax_idx != -1){
pooled_features[0] = max_val;
}
argmax[0] = argmax_idx;
if (argmax_idx != -1) {
pooled_features[0] = max_val;
}
argmax[0] = argmax_idx;
#ifdef DEBUG
printf("channel_%d idx(%d, %d, %d), argmax_idx=(%d, %.3f), total=%d, after pts_idx: %p, argmax: (%p, %d)\n",
channel_idx, x_idx, y_idx, z_idx, argmax_idx, max_val, total_pts, pts_idx_of_voxels, argmax, argmax_idx);
printf(
"channel_%d idx(%d, %d, %d), argmax_idx=(%d, %.3f), total=%d, after "
"pts_idx: %p, argmax: (%p, %d)\n",
channel_idx, x_idx, y_idx, z_idx, argmax_idx, max_val, total_pts,
pts_idx_of_voxels, argmax, argmax_idx);
#endif
}
__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
int out_y, int out_z, const float *pts_feature, const int *pts_idx_of_voxels, float *pooled_features){
// params pts_feature: (npoints, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), index 0 is the counter
// params pooled_features: (N, out_x, out_y, out_z, C)
// params argmax: (N, out_x, out_y, out_z, C)
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel;
pooled_features += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
float sum_val = 0;
int total_pts = pts_idx_of_voxels[0];
for (int k = 1; k <= total_pts; k++){
sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
}
if (total_pts > 0){
pooled_features[0] = sum_val / total_pts;
}
// Average-pool the features of the points collected in each voxel of each
// RoI. One thread handles one (box, channel, voxel) triple.
__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,
                                   int max_pts_each_voxel, int out_x, int out_y,
                                   int out_z, const float *pts_feature,
                                   const int *pts_idx_of_voxels,
                                   float *pooled_features) {
  // params pts_feature: (npoints, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
  //     index 0 of each voxel slot holds the count of collected points
  // params pooled_features: (N, out_x, out_y, out_z, C), output
  int box_idx = blockIdx.z;
  int channel_idx = blockIdx.y;
  // flat voxel index decomposed into (x_idx, y_idx, z_idx)
  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
  int x_idx = voxel_idx_flat / (out_y * out_z);
  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
  int z_idx = voxel_idx_flat % out_z;
  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
      y_idx >= out_y || z_idx >= out_z)
    return;

  // advance the pointers to this thread's voxel slot / output element
  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                       offset_base * max_pts_each_voxel;
  pooled_features += box_idx * out_x * out_y * out_z * channels +
                     offset_base * channels + channel_idx;

  float sum_val = 0;
  int total_pts = pts_idx_of_voxels[0];  // slot 0 is the point counter

  for (int k = 1; k <= total_pts; k++) {
    sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
  }

  // empty voxels keep their initial value (output untouched)
  if (total_pts > 0) {
    pooled_features[0] = sum_val / total_pts;
  }
}
void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x, int out_y, int out_z,
const float *rois, const float *pts, const float *pts_feature, int *argmax, int *pts_idx_of_voxels, float *pooled_features, int pool_method){
// params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
// params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool
int *pts_mask = NULL;
cudaMalloc(&pts_mask, boxes_num * pts_num * sizeof(int)); // (N, M)
cudaMemset(pts_mask, -1, boxes_num * pts_num * sizeof(int));
dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num);
dim3 threads(THREADS_PER_BLOCK);
generate_pts_mask_for_box3d<<<blocks_mask, threads>>>(boxes_num, pts_num, out_x, out_y, out_z, rois, pts, pts_mask);
// TODO: Merge the collect and pool functions, SS
dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
collect_inside_pts_for_box3d<<<blocks_collect, threads>>>(boxes_num, pts_num, max_pts_each_voxel,
out_x, out_y, out_z, pts_mask, pts_idx_of_voxels);
dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, boxes_num);
if (pool_method == 0){
roiaware_maxpool3d<<<blocks_pool, threads>>>(boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
pts_feature, pts_idx_of_voxels, pooled_features, argmax);
}
else if (pool_method == 1){
roiaware_avgpool3d<<<blocks_pool, threads>>>(boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
pts_feature, pts_idx_of_voxels, pooled_features);
}
cudaFree(pts_mask);
// Launch the RoI-aware pooling pipeline:
//   1) mark, for each (box, point) pair, which voxel the point falls into;
//   2) collect per-voxel point indices;
//   3) max- or average-pool the point features per voxel.
// params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
// params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C), only used for max pooling
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool
void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
                              int max_pts_each_voxel, int out_x, int out_y,
                              int out_z, const float *rois, const float *pts,
                              const float *pts_feature, int *argmax,
                              int *pts_idx_of_voxels, float *pooled_features,
                              int pool_method) {
  // temporary (N, npoints) mask: -1 = point not in box, otherwise the
  // bit-encoded voxel index of the point
  int *pts_mask = NULL;
  cudaMalloc(&pts_mask, boxes_num * pts_num * sizeof(int));  // (N, M)
  cudaMemset(pts_mask, -1, boxes_num * pts_num * sizeof(int));

  dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num);
  dim3 threads(THREADS_PER_BLOCK);
  generate_pts_mask_for_box3d<<<blocks_mask, threads>>>(
      boxes_num, pts_num, out_x, out_y, out_z, rois, pts, pts_mask);

  // TODO: Merge the collect and pool functions, SS
  dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
  collect_inside_pts_for_box3d<<<blocks_collect, threads>>>(
      boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, out_z, pts_mask,
      pts_idx_of_voxels);

  dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
                   boxes_num);
  if (pool_method == 0) {
    roiaware_maxpool3d<<<blocks_pool, threads>>>(
        boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
        pts_feature, pts_idx_of_voxels, pooled_features, argmax);
  } else if (pool_method == 1) {
    roiaware_avgpool3d<<<blocks_pool, threads>>>(
        boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
        pts_feature, pts_idx_of_voxels, pooled_features);
  }

  cudaFree(pts_mask);

#ifdef DEBUG
  // merge residue removed: one synchronize is enough to flush kernel printf
  cudaDeviceSynchronize();  // for using printf in kernel function
#endif
}
__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels, int out_x, int out_y, int out_z,
const int *argmax, const float *grad_out, float *grad_in){
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
grad_out += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
if (argmax[0] == -1) return;
atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
// Backward pass of RoI-aware max pooling: each output-voxel gradient is
// routed back to the single input point recorded in `argmax` during forward.
__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
                                            int out_x, int out_y, int out_z,
                                            const int *argmax,
                                            const float *grad_out,
                                            float *grad_in) {
  // params argmax: (N, out_x, out_y, out_z, C)
  // params grad_out: (N, out_x, out_y, out_z, C)
  // params grad_in: (npoints, C), return value
  // Thread mapping: blockIdx.z = box, blockIdx.y = channel,
  // blockIdx.x * blockDim.x + threadIdx.x = flattened voxel index.
  int box_idx = blockIdx.z;
  int channel_idx = blockIdx.y;
  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
  // Decompose the flat voxel index into (x, y, z) voxel coordinates.
  int x_idx = voxel_idx_flat / (out_y * out_z);
  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
  int z_idx = voxel_idx_flat % out_z;
  // Guard against padding threads of the last block.
  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
      y_idx >= out_y || z_idx >= out_z)
    return;
  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
  // Advance both pointers to this (box, voxel, channel) element.
  argmax += box_idx * out_x * out_y * out_z * channels +
            offset_base * channels + channel_idx;
  grad_out += box_idx * out_x * out_y * out_z * channels +
              offset_base * channels + channel_idx;
  // argmax == -1 marks an empty voxel: no point produced the forward max.
  if (argmax[0] == -1) return;
  // atomicAdd: distinct voxels (possibly from different boxes) may select the
  // same input point, so accumulation into grad_in must be atomic.
  atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
}
__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels, int out_x, int out_y, int out_z,
int max_pts_each_voxel, const int *pts_idx_of_voxels, const float *grad_out, float *grad_in){
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
int x_idx = voxel_idx_flat / (out_y * out_z);
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
int z_idx = voxel_idx_flat % out_z;
if (box_idx >= boxes_num || channel_idx >= channels|| x_idx >= out_x || y_idx >= out_y || z_idx >= out_z) return;
int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel;
grad_out += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx;
int total_pts = pts_idx_of_voxels[0];
float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
for (int k = 1; k <= total_pts; k++){
atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, grad_out[0] * cur_grad);
}
// Backward pass of RoI-aware average pooling: the output-voxel gradient is
// split equally among all points that were pooled into the voxel.
__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
                                            int out_x, int out_y, int out_z,
                                            int max_pts_each_voxel,
                                            const int *pts_idx_of_voxels,
                                            const float *grad_out,
                                            float *grad_in) {
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  //   slot 0 of the last dim holds the point count; slots 1.. hold indices.
  // params grad_out: (N, out_x, out_y, out_z, C)
  // params grad_in: (npoints, C), return value
  // Thread mapping: blockIdx.z = box, blockIdx.y = channel,
  // blockIdx.x * blockDim.x + threadIdx.x = flattened voxel index.
  int box_idx = blockIdx.z;
  int channel_idx = blockIdx.y;
  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
  // Decompose the flat voxel index into (x, y, z) voxel coordinates.
  int x_idx = voxel_idx_flat / (out_y * out_z);
  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
  int z_idx = voxel_idx_flat % out_z;
  // Guard against padding threads of the last block.
  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
      y_idx >= out_y || z_idx >= out_z)
    return;
  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
  // Advance pointers to this voxel's index list / gradient element.
  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                       offset_base * max_pts_each_voxel;
  grad_out += box_idx * out_x * out_y * out_z * channels +
              offset_base * channels + channel_idx;
  int total_pts = pts_idx_of_voxels[0];
  // fmaxf(..., 1.0) avoids division by zero for empty voxels (loop is a
  // no-op in that case anyway since total_pts == 0).
  float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
  // k starts at 1: slot 0 is the count, not a point index.
  for (int k = 1; k <= total_pts; k++) {
    // atomicAdd: one point can belong to voxels of several overlapping boxes.
    atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
              grad_out[0] * cur_grad);
  }
}
void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y, int out_z, int channels, int max_pts_each_voxel,
const int *pts_idx_of_voxels, const int *argmax, const float *grad_out, float *grad_in, int pool_method){
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
// params pool_method: 0: max_pool, 1: avg_pool
dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, boxes_num);
dim3 threads(THREADS_PER_BLOCK);
if (pool_method == 0){
roiaware_maxpool3d_backward<<<blocks, threads>>>(
boxes_num, channels, out_x, out_y, out_z, argmax, grad_out, grad_in
);
}
else if (pool_method == 1){
roiaware_avgpool3d_backward<<<blocks, threads>>>(
boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel, pts_idx_of_voxels, grad_out, grad_in
);
}
// Host-side launcher for the RoI-aware pooling backward pass; dispatches to
// the max- or avg-pool backward kernel depending on pool_method.
void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
                                       int out_z, int channels,
                                       int max_pts_each_voxel,
                                       const int *pts_idx_of_voxels,
                                       const int *argmax, const float *grad_out,
                                       float *grad_in, int pool_method) {
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params argmax: (N, out_x, out_y, out_z, C)
  // params grad_out: (N, out_x, out_y, out_z, C)
  // params grad_in: (npoints, C), return value
  // params pool_method: 0: max_pool, 1: avg_pool
  // One thread per voxel; grid y = channel, grid z = box (mirrors the
  // forward launch configuration).
  dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
              boxes_num);
  dim3 threads(THREADS_PER_BLOCK);
  if (pool_method == 0) {
    roiaware_maxpool3d_backward<<<blocks, threads>>>(
        boxes_num, channels, out_x, out_y, out_z, argmax, grad_out, grad_in);
  } else if (pool_method == 1) {
    roiaware_avgpool3d_backward<<<blocks, threads>>>(
        boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel,
        pts_idx_of_voxels, grad_out, grad_in);
  }
  // NOTE(review): an unrecognized pool_method silently launches nothing —
  // confirm upstream Python wrapper validates pool_method before calling.
}
from mmcv.cnn import build_norm_layer
from torch import nn
import mmdet3d.ops.spconv as spconv
from mmdet.models.backbones.resnet import BasicBlock, Bottleneck
from . import spconv
def conv3x3(in_planes, out_planes, stride=1, indice_key=None):
......
import torch
from mmdet3d.core.bbox.assigners import MaxIoUAssigner
from mmdet3d.core.bbox.samplers import IoUNegPiecewiseSampler
def test_iou_piecewise_sampler() -> None:
    """Smoke-test IoUNegPiecewiseSampler on hand-crafted 3D boxes.

    Assigns 12 candidate boxes to 2 ground-truth boxes with MaxIoUAssigner
    (LiDAR-coordinate 3D IoU), then samples pos/neg proposals with the
    piecewise negative sampler and checks the sampled index/box counts agree.
    Requires a CUDA device (tensors are moved with ``.cuda()``).
    """
    assigner = MaxIoUAssigner(
        pos_iou_thr=0.55,
        neg_iou_thr=0.55,
        min_pos_iou=0.55,
        ignore_iof_thr=-1,
        iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'))
    # 12 candidate boxes, each (x, y, z, w, l, h, rz); several duplicates so
    # that a known subset overlaps the ground truth above the 0.55 threshold.
    bboxes = torch.tensor(
        [[32, 32, 16, 8, 38, 42, -0.3], [32, 32, 16, 8, 38, 42, -0.3],
         [32, 32, 16, 8, 38, 42, -0.3], [32, 32, 16, 8, 38, 42, -0.3],
         [0, 0, 0, 10, 10, 10, 0.2], [10, 10, 10, 20, 20, 15, 0.6],
         [5, 5, 5, 15, 15, 15, 0.7], [5, 5, 5, 15, 15, 15, 0.7],
         [5, 5, 5, 15, 15, 15, 0.7], [32, 32, 16, 8, 38, 42, -0.3],
         [32, 32, 16, 8, 38, 42, -0.3], [32, 32, 16, 8, 38, 42, -0.3]],
        dtype=torch.float32).cuda()
    gt_bboxes = torch.tensor(
        [[0, 0, 0, 10, 10, 9, 0.2], [5, 10, 10, 20, 20, 15, 0.6]],
        dtype=torch.float32).cuda()
    gt_labels = torch.tensor([1, 1], dtype=torch.int64).cuda()
    assign_result = assigner.assign(bboxes, gt_bboxes, gt_labels=gt_labels)
    # Sample 10 proposals; negatives are drawn piecewise from two IoU bands
    # ([0.55, ...) and [0.1, 0.55)) with fractions 0.8 / 0.2.
    sampler = IoUNegPiecewiseSampler(
        num=10,
        pos_fraction=0.55,
        neg_piece_fractions=[0.8, 0.2],
        neg_iou_piece_thrs=[0.55, 0.1],
        neg_pos_ub=-1,
        add_gt_as_proposals=False)
    sample_result = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels)
    # NOTE(review): pos_inds is presumably an index tensor; comparing it to 4
    # only works if it is scalar-like — `len(sample_result.pos_inds) == 4`
    # may be the intended assertion. Confirm against the sampler's return type.
    assert sample_result.pos_inds == 4
    assert len(sample_result.pos_bboxes) == len(sample_result.pos_inds)
    assert len(sample_result.neg_bboxes) == len(sample_result.neg_inds)
#!/usr/bin/env bash
# Single-node distributed testing launcher.
# Usage: ./dist_test.sh <config> <checkpoint> <num_gpus> [extra test.py args...]
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
# Rendezvous port for torch.distributed; override with PORT=<port> env var.
PORT=${PORT:-29500}
# Prepend the repo root so test.py can import the package without installing it.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
    $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4}
#!/usr/bin/env bash
# Single-node distributed training launcher.
# Usage: ./dist_train.sh <config> <num_gpus> [extra train.py args...]
# Merge residue left two launch-command prefixes here; the first one dangled
# via '\' into the PYTHONPATH assignment and broke the script. Merged into a
# single command that still honors the PYTHON override.
PYTHON=${PYTHON:-"python"}
CONFIG=$1
GPUS=$2
# Rendezvous port for torch.distributed; override with PORT=<port> env var.
PORT=${PORT:-29500}
# Prepend the repo root so train.py can import the package without installing it.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
$PYTHON -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
#!/usr/bin/env bash
# Slurm testing launcher.
# Usage: GPUS=8 ./slurm_test.sh <partition> <job_name> <config> <checkpoint> [extra args...]
set -x

export PYTHONPATH=`pwd`:$PYTHONPATH

PARTITION=$1
JOB_NAME=$2
# A diff-hunk marker ("@@ -9,14 +8,17 @@ CONFIG=$3") had replaced this
# assignment; restored to plain bash.
CONFIG=$3
CHECKPOINT=$4
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
# Everything after the 4th positional arg is forwarded to tools/test.py.
PY_ARGS=${@:5}
SRUN_ARGS=${SRUN_ARGS:-""}

# Prepend the repo root so tools/test.py can import the package.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}
# (head of slurm_train.sh — shebang and PARTITION/JOB_NAME parsing — sits
# above this chunk; a diff-hunk marker had replaced the CONFIG line and an
# old PY_ARGS default ("--validate") duplicated the new ${@:5} form.)
CONFIG=$3
WORK_DIR=$4
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""}
# Everything after the 4th positional arg is forwarded to tools/train.py.
PY_ARGS=${@:5}

# Prepend the repo root so tools/train.py can import the package.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment