Commit f2b01720 authored by liyinhao

Merge branch 'master' into process_raw_data

parents 08c8adb6 47850641
......@@ -11,10 +11,10 @@ class CameraInstance3DBoxes(BaseInstance3DBoxes):
Coordinates in camera:
.. code-block:: none
z front
z front (yaw=0.5*pi)
/
/
0 ------> x right
0 ------> x right (yaw=0)
|
|
v
......@@ -22,11 +22,15 @@ class CameraInstance3DBoxes(BaseInstance3DBoxes):
The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5],
and the yaw is around the y axis, thus the rotation axis=1.
The yaw is 0 at the positive direction of x axis, and increases from
the positive direction of x to the positive direction of z.
Attributes:
tensor (torch.Tensor): float matrix of N x box_dim.
box_dim (int): integer indicates the dimension of a box
Each row is (x, y, z, x_size, y_size, z_size, yaw, ...).
with_yaw (bool): if True, the value of yaw will be set to 0 as minmax
boxes.
"""
@property
......@@ -75,7 +79,7 @@ class CameraInstance3DBoxes(BaseInstance3DBoxes):
"""Calculate the coordinates of corners of all the boxes.
Convert the boxes to in clockwise order, in the form of
(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z0, x1y1z1)
(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)
.. code-block:: none
......@@ -85,7 +89,7 @@ class CameraInstance3DBoxes(BaseInstance3DBoxes):
(x0, y0, z1) + ----------- + (x1, y0, z1)
/| / |
/ | / |
(x0, y0, z0) + ----------- + + (x1, y1, z0)
(x0, y0, z0) + ----------- + + (x1, y1, z1)
| / . | /
| / origin | /
(x0, y1, z0) + ----------- + -------> x right
......@@ -123,7 +127,7 @@ class CameraInstance3DBoxes(BaseInstance3DBoxes):
return self.tensor[:, [0, 2, 3, 5, 6]]
@property
def nearset_bev(self):
def nearest_bev(self):
"""Calculate the 2D bounding boxes in BEV without rotation
Returns:
......@@ -150,11 +154,7 @@ class CameraInstance3DBoxes(BaseInstance3DBoxes):
"""Calculate whether the points is in any of the boxes
Args:
angles (float | torch.Tensor): rotation angle
Returns:
None if `return_rot_mat=False`,
torch.Tensor if `return_rot_mat=True`
angle (float | torch.Tensor): rotation angle
"""
if not isinstance(angle, torch.Tensor):
angle = self.tensor.new_tensor(angle)
......@@ -166,13 +166,23 @@ class CameraInstance3DBoxes(BaseInstance3DBoxes):
self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T
self.tensor[:, 6] += angle
def flip(self):
"""Flip the boxes in horizontal direction
def flip(self, bev_direction='horizontal'):
"""Flip the boxes in BEV along given BEV direction
In CAM coordinates, it flips the x (horizontal) or z (vertical) axis.
In CAM coordinates, it flips the x axis.
Args:
bev_direction (str): Flip direction (horizontal or vertical).
"""
assert bev_direction in ('horizontal', 'vertical')
if bev_direction == 'horizontal':
self.tensor[:, 0::7] = -self.tensor[:, 0::7]
if self.with_yaw:
self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
elif bev_direction == 'vertical':
self.tensor[:, 2::7] = -self.tensor[:, 2::7]
if self.with_yaw:
self.tensor[:, 6] = -self.tensor[:, 6]
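A minimal usage sketch of the two flip directions above (illustrative values; the yaw updates follow the two branches shown):

import torch
from mmdet3d.core.bbox import CameraInstance3DBoxes

boxes = CameraInstance3DBoxes(
    torch.tensor([[1.0, 1.5, 10.0, 1.6, 1.5, 4.0, 0.3]]))
boxes.flip('horizontal')  # negates x; yaw becomes pi - 0.3
boxes.flip('vertical')    # negates z; yaw is negated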
def in_range_bev(self, box_range):
"""Check whether the boxes are in the given range
......@@ -188,7 +198,7 @@ class CameraInstance3DBoxes(BaseInstance3DBoxes):
TODO: check whether this will affect the performance
Returns:
a binary vector, indicating whether each box is inside
torch.Tensor: Indicating whether each box is inside
the reference range.
"""
in_range_flags = ((self.tensor[:, 0] > box_range[0])
......@@ -230,3 +240,22 @@ class CameraInstance3DBoxes(BaseInstance3DBoxes):
lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height)
overlaps_h = torch.clamp(heighest_of_bottom - lowest_of_top, min=0)
return overlaps_h
def convert_to(self, dst, rt_mat=None):
"""Convert self to `dst` mode.
Args:
dst (BoxMode): the target Box mode
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from `src` coordinates to `dst` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
BaseInstance3DBoxes:
The converted box of the same type in the `dst` mode.
"""
from .box_3d_mode import Box3DMode
return Box3DMode.convert(
box=self, src=Box3DMode.CAM, dst=dst, rt_mat=rt_mat)
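A hedged usage sketch of convert_to, assuming Box3DMode.convert supplies a default CAM->LIDAR transform when rt_mat is None (the new unit test at the end of this diff calls it the same way):

import torch
from mmdet3d.core.bbox import Box3DMode, CameraInstance3DBoxes

cam_boxes = CameraInstance3DBoxes(
    torch.tensor([[1.0, 1.5, 10.0, 1.6, 1.5, 4.0, 0.3]]))
lidar_boxes = cam_boxes.convert_to(Box3DMode.LIDAR)  # rt_mat=None -> default mapping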
import numpy as np
import torch
from .base_box3d import BaseInstance3DBoxes
from .utils import limit_period, rotation_3d_in_axis
class DepthInstance3DBoxes(BaseInstance3DBoxes):
"""3D boxes of instances in Depth coordinates
Coordinates in Depth:
.. code-block:: none
up z y front (yaw=0.5*pi)
^ ^
| /
| /
0 ------> x right (yaw=0)
The relative coordinate of bottom center in a Depth box is [0.5, 0.5, 0],
and the yaw is around the z axis, thus the rotation axis=2.
The yaw is 0 at the positive direction of x axis, and increases from
the positive direction of x to the positive direction of y.
Attributes:
tensor (torch.Tensor): float matrix of N x box_dim.
box_dim (int): integer indicates the dimension of a box
Each row is (x, y, z, x_size, y_size, z_size, yaw, ...).
with_yaw (bool): if True, the value of yaw will be set to 0 as minmax
boxes.
"""
@property
def gravity_center(self):
"""Calculate the gravity center of all the boxes.
Returns:
torch.Tensor: a tensor with center of each box.
"""
bottom_center = self.bottom_center
gravity_center = torch.zeros_like(bottom_center)
gravity_center[:, :2] = bottom_center[:, :2]
gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5
return gravity_center
@property
def corners(self):
"""Calculate the coordinates of corners of all the boxes.
Convert the boxes to corners in clockwise order, in form of
(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)
.. code-block:: none
up z
front y ^
/ |
/ |
(x0, y1, z1) + ----------- + (x1, y1, z1)
/| / |
/ | / |
(x0, y0, z1) + ----------- + + (x1, y1, z0)
| / . | /
| / origin | /
(x0, y0, z0) + ----------- + --------> right x
(x1, y0, z0)
Returns:
torch.Tensor: corners of each box with size (N, 8, 3)
"""
dims = self.dims
corners_norm = torch.from_numpy(
np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to(
device=dims.device, dtype=dims.dtype)
corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
# use relative origin [0.5, 0.5, 0]
corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0])
corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])
# rotate around z axis
corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2)
corners += self.tensor[:, :3].view(-1, 1, 3)
return corners
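A minimal sketch of the corners property (illustrative values; DepthInstance3DBoxes is exported alongside the other box classes, as the updated test import below shows):

import torch
from mmdet3d.core.bbox import DepthInstance3DBoxes

boxes = DepthInstance3DBoxes(
    torch.tensor([[0.0, 0.0, 0.0, 2.0, 1.0, 1.5, 0.0],
                  [1.0, 2.0, 0.0, 2.0, 1.0, 1.5, 0.5]]))
corners = boxes.corners  # expected shape (2, 8, 3), in the corner order documented above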
@property
def bev(self):
"""Calculate the 2D bounding boxes in BEV with rotation
Returns:
torch.Tensor: a nx5 tensor of 2D BEV box of each box.
The box is in XYWHR format
"""
return self.tensor[:, [0, 1, 3, 4, 6]]
@property
def nearest_bev(self):
"""Calculate the 2D bounding boxes in BEV without rotation
Returns:
torch.Tensor: a tensor of 2D BEV box of each box.
"""
# Obtain BEV boxes with rotation in XYWHR format
bev_rotated_boxes = self.bev
# convert the rotation to a valid range
rotations = bev_rotated_boxes[:, -1]
normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi))
# find the center of boxes
conditions = (normed_rotations > np.pi / 4)[..., None]
bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:,
[0, 1, 3, 2]],
bev_rotated_boxes[:, :4])
centers = bboxes_xywh[:, :2]
dims = bboxes_xywh[:, 2:]
bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1)
return bev_boxes
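An illustrative sketch of nearest_bev (made-up values):

import torch
from mmdet3d.core.bbox import DepthInstance3DBoxes

boxes = DepthInstance3DBoxes(
    torch.tensor([[1.0, 2.0, 0.0, 2.0, 1.0, 1.5, 1.2]]))
aligned = boxes.nearest_bev  # shape (1, 4): (x1, y1, x2, y2)
# |limit_period(1.2)| > pi / 4, so width and height are swapped before the
# min/max corners are formed, per the `conditions` branch above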
def rotate(self, angle):
"""Calculate whether the points is in any of the boxes
Args:
angle (float | torch.Tensor): rotation angle
"""
if not isinstance(angle, torch.Tensor):
angle = self.tensor.new_tensor(angle)
rot_sin = torch.sin(angle)
rot_cos = torch.cos(angle)
rot_mat = self.tensor.new_tensor([[rot_cos, -rot_sin, 0],
[rot_sin, rot_cos, 0], [0, 0, 1]])
self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat.T
if self.with_yaw:
self.tensor[:, 6] -= angle
else:
corners_rot = self.corners @ rot_mat.T
new_x_size = corners_rot[..., 0].max(
dim=1, keepdim=True)[0] - corners_rot[..., 0].min(
dim=1, keepdim=True)[0]
new_y_size = corners_rot[..., 1].max(
dim=1, keepdim=True)[0] - corners_rot[..., 1].min(
dim=1, keepdim=True)[0]
self.tensor[:, 3:5] = torch.cat((new_x_size, new_y_size), dim=-1)
def flip(self, bev_direction='horizontal'):
"""Flip the boxes in BEV along given BEV direction
In Depth coordinates, it flips x (horizontal) or y (vertical) axis.
Args:
bev_direction (str): Flip direction (horizontal or vertical).
"""
assert bev_direction in ('horizontal', 'vertical')
if bev_direction == 'horizontal':
self.tensor[:, 0::7] = -self.tensor[:, 0::7]
if self.with_yaw:
self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
elif bev_direction == 'vertical':
self.tensor[:, 1::7] = -self.tensor[:, 1::7]
if self.with_yaw:
self.tensor[:, 6] = -self.tensor[:, 6]
def in_range_bev(self, box_range):
"""Check whether the boxes are in the given range
Args:
box_range (list | torch.Tensor): the range of box
(x_min, y_min, x_max, y_max)
Note:
In the original implementation of SECOND, checking whether
a box is in the range checks whether the points are in a convex
polygon; here we try to reduce the burden for simpler cases.
TODO: check whether this will affect the performance
Returns:
torch.Tensor: Indicating whether each box is inside
the reference range.
"""
in_range_flags = ((self.tensor[:, 0] > box_range[0])
& (self.tensor[:, 1] > box_range[1])
& (self.tensor[:, 0] < box_range[2])
& (self.tensor[:, 1] < box_range[3]))
return in_range_flags
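A short sketch of in_range_bev combined with boolean indexing, the same pattern ObjectRangeFilter adopts later in this diff (values are illustrative):

import torch
from mmdet3d.core.bbox import DepthInstance3DBoxes

boxes = DepthInstance3DBoxes(
    torch.tensor([[1.0, 2.0, 0.0, 2.0, 1.0, 1.5, 0.0],
                  [80.0, 2.0, 0.0, 2.0, 1.0, 1.5, 0.0]]))
mask = boxes.in_range_bev([-40.0, -40.0, 40.0, 40.0])  # tensor([True, False])
kept = boxes[mask]  # keep only the boxes inside the BEV range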
def convert_to(self, dst, rt_mat=None):
"""Convert self to `dst` mode.
Args:
dst (BoxMode): the target Box mode
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from `src` coordinates to `dst` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
BaseInstance3DBoxes:
The converted box of the same type in the `dst` mode.
"""
from .box_3d_mode import Box3DMode
return Box3DMode.convert(
box=self, src=Box3DMode.DEPTH, dst=dst, rt_mat=rt_mat)
import numpy as np
import torch
from mmdet3d.ops.roiaware_pool3d import points_in_boxes_gpu
from .base_box3d import BaseInstance3DBoxes
from .utils import limit_period, rotation_3d_in_axis
......@@ -11,19 +12,23 @@ class LiDARInstance3DBoxes(BaseInstance3DBoxes):
Coordinates in LiDAR:
.. code-block:: none
up z x front
up z x front (yaw=0.5*pi)
^ ^
| /
| /
left y <------ 0
(yaw=pi) left y <------ 0
The relative coordinate of bottom center in a LiDAR box is [0.5, 0.5, 0],
and the yaw is around the z axis, thus the rotation axis=2.
The yaw is 0 at the negative direction of y axis, and increases from
the negative direction of y to the positive direction of x.
Attributes:
tensor (torch.Tensor): float matrix of N x box_dim.
box_dim (int): integer indicates the dimension of a box
Each row is (x, y, z, x_size, y_size, z_size, yaw, ...).
with_yaw (bool): if True, the value of yaw will be set to 0 as minmax
boxes.
"""
@property
......@@ -44,7 +49,7 @@ class LiDARInstance3DBoxes(BaseInstance3DBoxes):
"""Calculate the coordinates of corners of all the boxes.
Convert the boxes to corners in clockwise order, in form of
(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z0, x1y1z1)
(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)
.. code-block:: none
......@@ -90,7 +95,7 @@ class LiDARInstance3DBoxes(BaseInstance3DBoxes):
return self.tensor[:, [0, 1, 3, 4, 6]]
@property
def nearset_bev(self):
def nearest_bev(self):
"""Calculate the 2D bounding boxes in BEV without rotation
Returns:
......@@ -117,11 +122,7 @@ class LiDARInstance3DBoxes(BaseInstance3DBoxes):
"""Calculate whether the points is in any of the boxes
Args:
angles (float | torch.Tensor): rotation angle
Returns:
None if `return_rot_mat=False`,
torch.Tensor if `return_rot_mat=True`
angle (float | torch.Tensor): rotation angle
"""
if not isinstance(angle, torch.Tensor):
angle = self.tensor.new_tensor(angle)
......@@ -133,13 +134,27 @@ class LiDARInstance3DBoxes(BaseInstance3DBoxes):
self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T
self.tensor[:, 6] += angle
def flip(self):
"""Flip the boxes in horizontal direction
if self.tensor.shape[1] == 9:
# rotate velo vector
self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2]
def flip(self, bev_direction='horizontal'):
"""Flip the boxes in BEV along given BEV direction
In LIDAR coordinates, it flips the y axis.
In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis.
Args:
bev_direction (str): Flip direction (horizontal or vertical).
"""
assert bev_direction in ('horizontal', 'vertical')
if bev_direction == 'horizontal':
self.tensor[:, 1::7] = -self.tensor[:, 1::7]
if self.with_yaw:
self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
elif bev_direction == 'vertical':
self.tensor[:, 0::7] = -self.tensor[:, 0::7]
if self.with_yaw:
self.tensor[:, 6] = -self.tensor[:, 6]
def in_range_bev(self, box_range):
"""Check whether the boxes are in the given range
......@@ -155,7 +170,7 @@ class LiDARInstance3DBoxes(BaseInstance3DBoxes):
TODO: check whether this will affect the performance
Returns:
a binary vector, indicating whether each box is inside
torch.Tensor: Indicating whether each box is inside
the reference range.
"""
in_range_flags = ((self.tensor[:, 0] > box_range[0])
......@@ -163,3 +178,51 @@ class LiDARInstance3DBoxes(BaseInstance3DBoxes):
& (self.tensor[:, 0] < box_range[2])
& (self.tensor[:, 1] < box_range[3]))
return in_range_flags
def convert_to(self, dst, rt_mat=None):
"""Convert self to `dst` mode.
Args:
dst (BoxMode): the target Box mode
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from `src` coordinates to `dst` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
BaseInstance3DBoxes:
The converted box of the same type in the `dst` mode.
"""
from .box_3d_mode import Box3DMode
return Box3DMode.convert(
box=self, src=Box3DMode.LIDAR, dst=dst, rt_mat=rt_mat)
def enlarged_box(self, extra_width):
"""Enlarge the length, width and height boxes
Args:
extra_width (float | torch.Tensor): extra width to enlarge the box
Returns:
:obj:LiDARInstance3DBoxes: enlarged boxes
"""
enlarged_boxes = self.tensor.clone()
enlarged_boxes[:, 3:6] += extra_width * 2
# bottom center z minus extra_width
enlarged_boxes[:, 2] -= extra_width
return self.new_box(enlarged_boxes)
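A minimal sketch of enlarged_box (illustrative values):

import torch
from mmdet3d.core.bbox import LiDARInstance3DBoxes

boxes = LiDARInstance3DBoxes(
    torch.tensor([[5.0, 2.0, -1.5, 1.6, 4.0, 1.5, 0.3]]))
bigger = boxes.enlarged_box(0.2)
# sizes grow by 2 * 0.2 in x/y/z and the bottom-center z drops by 0.2;
# the original boxes are untouched since the tensor is cloned above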
def points_in_boxes(self, points):
"""Find the box which the points are in.
Args:
points (:obj:torch.Tensor): Points in shape Nx3
Returns:
torch.Tensor: The index of the box each point is in.
"""
box_idx = points_in_boxes_gpu(
points.unsqueeze(0),
self.tensor.unsqueeze(0).to(points.device)).squeeze(0)
return box_idx
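A hedged sketch of points_in_boxes; a CUDA device is assumed since it relies on points_in_boxes_gpu:

import torch
from mmdet3d.core.bbox import LiDARInstance3DBoxes

boxes = LiDARInstance3DBoxes(
    torch.tensor([[5.0, 2.0, -1.5, 1.6, 4.0, 1.5, 0.0]]))
points = torch.rand(1000, 3, device='cuda') * 10
box_idx = boxes.points_in_boxes(points)  # shape (1000,); -1 for points outside every box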
from mmcv.runner.optimizer import OPTIMIZER_BUILDERS, OPTIMIZERS
from mmcv.utils import build_from_cfg
from mmdet3d.utils import get_root_logger
from mmdet.core.optimizer import OPTIMIZER_BUILDERS, OPTIMIZERS
from .cocktail_optimizer import CocktailOptimizer
......
from mmcv.runner.optimizer import OPTIMIZERS
from torch.optim import Optimizer
from mmdet.core.optimizer import OPTIMIZERS
@OPTIMIZERS.register_module()
class CocktailOptimizer(Optimizer):
......@@ -9,6 +8,11 @@ class CocktailOptimizer(Optimizer):
This optimizer applies the cocktail optimization for multi-modality models.
Args:
optimizers (list[:obj:`torch.optim.Optimizer`]): The list containing
different optimizers that optimize different parameters
step_intervals (list[int]): Step intervals of each optimizer
"""
def __init__(self, optimizers, step_intervals=None):
......@@ -18,6 +22,9 @@ class CocktailOptimizer(Optimizer):
self.param_groups += optimizer.param_groups
if not isinstance(step_intervals, list):
step_intervals = [1] * len(self.optimizers)
assert len(step_intervals) == len(optimizers), \
'"step_intervals" should contain the same number of intervals as' \
f'len(optimizers)={len(optimizers)}, got {step_intervals}'
self.step_intervals = step_intervals
self.num_step_updated = 0
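A hypothetical construction sketch of CocktailOptimizer, assuming two small sub-networks standing in for the image and point-cloud branches (all names and values are illustrative):

import torch
import torch.nn as nn

img_branch, pts_branch = nn.Linear(4, 4), nn.Linear(4, 4)
optimizers = [
    torch.optim.SGD(img_branch.parameters(), lr=0.01),
    torch.optim.Adam(pts_branch.parameters(), lr=1e-3),
]
# step_intervals must have one entry per optimizer, per the assertion above
cocktail = CocktailOptimizer(optimizers, step_intervals=[1, 2])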
......
......@@ -9,7 +9,7 @@ import torch
from mmcv.utils import print_log
from mmdet.datasets import DATASETS
from ..core.bbox import box_np_ops
from ..core.bbox import Box3DMode, CameraInstance3DBoxes, box_np_ops
from .custom_3d import Custom3DDataset
from .utils import remove_dontcare
......@@ -87,13 +87,14 @@ class KittiDataset(Custom3DDataset):
# print(gt_names, len(loc))
gt_bboxes_3d = np.concatenate([loc, dims, rots[..., np.newaxis]],
axis=1).astype(np.float32)
# this change gt_bboxes_3d to velodyne coordinates
gt_bboxes_3d = box_np_ops.box_camera_to_lidar(gt_bboxes_3d, rect,
Trv2c)
# convert gt_bboxes_3d to velodyne coordinates
gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to(
Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c))
gt_bboxes = annos['bbox']
selected = self.drop_arrays_by_name(gt_names, ['DontCare'])
gt_bboxes_3d = gt_bboxes_3d[selected].astype('float32')
# gt_bboxes_3d = gt_bboxes_3d[selected].astype('float32')
gt_bboxes = gt_bboxes[selected].astype('float32')
gt_names = gt_names[selected]
......
......@@ -7,7 +7,7 @@ import pyquaternion
from nuscenes.utils.data_classes import Box as NuScenesBox
from mmdet.datasets import DATASETS
from ..core.bbox import box_np_ops
from ..core.bbox import LiDARInstance3DBoxes, box_np_ops
from .custom_3d import Custom3DDataset
......@@ -152,10 +152,6 @@ class NuScenesDataset(Custom3DDataset):
# filter out bbox containing no points
mask = info['num_lidar_pts'] > 0
gt_bboxes_3d = info['gt_boxes'][mask]
# the nuscenes box center is [0.5, 0.5, 0.5], we keep it
# the same as KITTI [0.5, 0.5, 0]
box_np_ops.change_box3d_center_(gt_bboxes_3d, [0.5, 0.5, 0.5],
[0.5, 0.5, 0])
gt_names_3d = info['gt_names'][mask]
gt_labels_3d = []
for cat in gt_names_3d:
......@@ -171,6 +167,13 @@ class NuScenesDataset(Custom3DDataset):
gt_velocity[nan_mask] = [0.0, 0.0]
gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1)
# the nuscenes box center is [0.5, 0.5, 0.5], we keep it
# the same as KITTI [0.5, 0.5, 0]
gt_bboxes_3d = LiDARInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
origin=[0.5, 0.5, 0.5])
anns_results = dict(
gt_bboxes_3d=gt_bboxes_3d,
gt_labels_3d=gt_labels_3d,
......
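The origin argument used in the NuScenes change above is exercised by the new unit test further down; a minimal sketch of the same idea (made-up values):

import numpy as np
from mmdet3d.core.bbox import LiDARInstance3DBoxes

# a gravity-centered [0.5, 0.5, 0.5] box of height 1.5; passing origin converts it
# to the bottom-centered [0.5, 0.5, 0] convention, shifting z down by half the height
box = LiDARInstance3DBoxes(
    np.array([[5.0, 2.0, 0.0, 1.6, 4.0, 1.5, 0.3]], dtype=np.float32),
    origin=[0.5, 0.5, 0.5])
# box.tensor[0, 2] is now -0.75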
import numpy as np
from mmcv.parallel import DataContainer as DC
from mmdet3d.core.bbox import BaseInstance3DBoxes
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import to_tensor
......@@ -39,9 +40,8 @@ class DefaultFormatBundle(object):
img = np.ascontiguousarray(results['img'].transpose(2, 0, 1))
results['img'] = DC(to_tensor(img), stack=True)
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_3d', 'gt_bboxes_ignore',
'gt_labels', 'gt_labels_3d', 'pts_instance_mask',
'pts_semantic_mask'
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_labels_3d', 'pts_instance_mask', 'pts_semantic_mask'
]:
if key not in results:
continue
......@@ -49,6 +49,14 @@ class DefaultFormatBundle(object):
results[key] = DC([to_tensor(res) for res in results[key]])
else:
results[key] = DC(to_tensor(results[key]))
if 'gt_bboxes_3d' in results:
if isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes):
results['gt_bboxes_3d'] = DC(
results['gt_bboxes_3d'], cpu_only=True)
else:
results['gt_bboxes_3d'] = DC(
to_tensor(results['gt_bboxes_3d']))
if 'gt_masks' in results:
results['gt_masks'] = DC(results['gt_masks'], cpu_only=True)
if 'gt_semantic_seg' in results:
......
......@@ -26,12 +26,8 @@ class RandomFlip3D(RandomFlip):
self.sync_2d = sync_2d
def random_flip_points(self, gt_bboxes_3d, points):
gt_bboxes_3d[:, 1] = -gt_bboxes_3d[:, 1]
gt_bboxes_3d[:, 6] = -gt_bboxes_3d[:, 6] + np.pi
gt_bboxes_3d.flip()
points[:, 1] = -points[:, 1]
if gt_bboxes_3d.shape[1] == 9:
# flip velocitys at the same time
gt_bboxes_3d[:, 8] = -gt_bboxes_3d[:, 8]
return gt_bboxes_3d, points
def __call__(self, input_dict):
......@@ -121,10 +117,13 @@ class ObjectSample(object):
gt_bboxes_2d = input_dict['gt_bboxes']
# Assume for now 3D & 2D bboxes are the same
sampled_dict = self.db_sampler.sample_all(
gt_bboxes_3d, gt_labels_3d, gt_bboxes_2d=gt_bboxes_2d, img=img)
gt_bboxes_3d.tensor.numpy(),
gt_labels_3d,
gt_bboxes_2d=gt_bboxes_2d,
img=img)
else:
sampled_dict = self.db_sampler.sample_all(
gt_bboxes_3d, gt_labels_3d, img=None)
gt_bboxes_3d.tensor.numpy(), gt_labels_3d, img=None)
if sampled_dict is not None:
sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d']
......@@ -133,8 +132,9 @@ class ObjectSample(object):
gt_labels_3d = np.concatenate([gt_labels_3d, sampled_gt_labels],
axis=0)
gt_bboxes_3d = np.concatenate([gt_bboxes_3d, sampled_gt_bboxes_3d
]).astype(np.float32)
gt_bboxes_3d = gt_bboxes_3d.new_box(
np.concatenate(
[gt_bboxes_3d.tensor.numpy(), sampled_gt_bboxes_3d]))
points = self.remove_points_in_boxes(points, sampled_gt_bboxes_3d)
# check the points dimension
......@@ -178,14 +178,16 @@ class ObjectNoise(object):
points = input_dict['points']
# TODO: check this inplace function
numpy_box = gt_bboxes_3d.tensor.numpy()
noise_per_object_v3_(
gt_bboxes_3d,
numpy_box,
points,
rotation_perturb=self.rot_uniform_noise,
center_noise_std=self.loc_noise_std,
global_random_rot_range=self.global_rot_range,
num_try=self.num_try)
input_dict['gt_bboxes_3d'] = gt_bboxes_3d.astype('float32')
input_dict['gt_bboxes_3d'] = gt_bboxes_3d.new_box(numpy_box)
input_dict['points'] = points
return input_dict
......@@ -212,7 +214,7 @@ class GlobalRotScale(object):
def _trans_bbox_points(self, gt_boxes, points):
noise_trans = np.random.normal(0, self.trans_normal_noise[0], 3).T
points[:, :3] += noise_trans
gt_boxes[:, :3] += noise_trans
gt_boxes.translate(noise_trans)
return gt_boxes, points, noise_trans
def _rot_bbox_points(self, gt_boxes, points, rotation=np.pi / 4):
......@@ -221,16 +223,8 @@ class GlobalRotScale(object):
noise_rotation = np.random.uniform(rotation[0], rotation[1])
points[:, :3], rot_mat_T = box_np_ops.rotation_points_single_angle(
points[:, :3], noise_rotation, axis=2)
gt_boxes[:, :3], _ = box_np_ops.rotation_points_single_angle(
gt_boxes[:, :3], noise_rotation, axis=2)
gt_boxes[:, 6] += noise_rotation
if gt_boxes.shape[1] == 9:
# rotate velo vector
rot_cos = np.cos(noise_rotation)
rot_sin = np.sin(noise_rotation)
rot_mat_T_bev = np.array([[rot_cos, -rot_sin], [rot_sin, rot_cos]],
dtype=points.dtype)
gt_boxes[:, 7:9] = gt_boxes[:, 7:9] @ rot_mat_T_bev
gt_boxes.rotate(noise_rotation)
return gt_boxes, points, rot_mat_T
def _scale_bbox_points(self,
......@@ -240,9 +234,7 @@ class GlobalRotScale(object):
max_scale=1.05):
noise_scale = np.random.uniform(min_scale, max_scale)
points[:, :3] *= noise_scale
gt_boxes[:, :6] *= noise_scale
if gt_boxes.shape[1] == 9:
gt_boxes[:, 7:] *= noise_scale
gt_boxes.scale(noise_scale)
return gt_boxes, points, noise_scale
def __call__(self, input_dict):
......@@ -256,7 +248,7 @@ class GlobalRotScale(object):
gt_bboxes_3d, points, trans_factor = self._trans_bbox_points(
gt_bboxes_3d, points)
input_dict['gt_bboxes_3d'] = gt_bboxes_3d.astype('float32')
input_dict['gt_bboxes_3d'] = gt_bboxes_3d
input_dict['points'] = points
input_dict['pcd_scale_factor'] = scale_factor
input_dict['pcd_rotation'] = rotation_factor
......@@ -290,10 +282,6 @@ class ObjectRangeFilter(object):
self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
self.bev_range = self.pcd_range[[0, 1, 3, 4]]
@staticmethod
def limit_period(val, offset=0.5, period=np.pi):
return val - np.floor(val / period + offset) * period
@staticmethod
def filter_gt_box_outside_range(gt_bboxes_3d, limit_range):
"""remove gtbox outside training range.
......@@ -314,14 +302,13 @@ class ObjectRangeFilter(object):
def __call__(self, input_dict):
gt_bboxes_3d = input_dict['gt_bboxes_3d']
gt_labels_3d = input_dict['gt_labels_3d']
mask = self.filter_gt_box_outside_range(gt_bboxes_3d, self.bev_range)
mask = gt_bboxes_3d.in_range_bev(self.bev_range)
gt_bboxes_3d = gt_bboxes_3d[mask]
gt_labels_3d = gt_labels_3d[mask]
# limit rad to [-pi, pi]
gt_bboxes_3d[:, 6] = self.limit_period(
gt_bboxes_3d[:, 6], offset=0.5, period=2 * np.pi)
input_dict['gt_bboxes_3d'] = gt_bboxes_3d.astype('float32')
gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi)
input_dict['gt_bboxes_3d'] = gt_bboxes_3d
input_dict['gt_labels_3d'] = gt_labels_3d
return input_dict
......
......@@ -168,6 +168,8 @@ class AnchorTrainMixin(object):
labels = anchors.new_zeros(num_valid_anchors, dtype=torch.long)
label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
if len(gt_bboxes) > 0:
if not isinstance(gt_bboxes, torch.Tensor):
gt_bboxes = gt_bboxes.tensor.to(anchors.device)
assign_result = bbox_assigner.assign(anchors, gt_bboxes,
gt_bboxes_ignore, gt_labels)
sampling_result = self.bbox_sampler.sample(assign_result, anchors,
......
......@@ -121,8 +121,7 @@ class PartA2BboxHead(nn.Module):
3,
padding=1,
norm_cfg=norm_cfg,
indice_key=f'rcnn_down0',
conv_type='SubMConv3d'))
indice_key='rcnn_down0'))
merge_conv_channel_last = channel
down_conv_channel_last = merge_conv_channel_last
......@@ -135,8 +134,7 @@ class PartA2BboxHead(nn.Module):
3,
padding=1,
norm_cfg=norm_cfg,
indice_key=f'rcnn_down1',
conv_type='SubMConv3d'))
indice_key='rcnn_down1'))
down_conv_channel_last = channel
self.conv_down.add_module('merge_conv',
......
......@@ -5,7 +5,6 @@ import torch.nn.functional as F
from mmdet3d.core import multi_apply
from mmdet3d.core.bbox import box_torch_ops
from mmdet3d.models.builder import build_loss
from mmdet3d.ops.roiaware_pool3d import points_in_boxes_gpu
from mmdet.models import HEADS
......@@ -14,7 +13,7 @@ class PointwiseSemanticHead(nn.Module):
"""Semantic segmentation head for point-wise segmentation.
Predict point-wise segmentation and part regression results for PartA2.
See https://arxiv.org/abs/1907.03670 for more detials.
See `paper <https://arxiv.org/abs/1907.03670>`_ for more details.
Args:
in_channels (int): the number of input channel.
......@@ -65,28 +64,27 @@ class PointwiseSemanticHead(nn.Module):
seg_preds=seg_preds, part_preds=part_preds, part_feats=part_feats)
def get_targets_single(self, voxel_centers, gt_bboxes_3d, gt_labels_3d):
"""generate segmentation and part prediction targets
"""generate segmentation and part prediction targets for a single sample
Args:
voxel_centers (torch.Tensor): shape [voxel_num, 3],
the center of voxels
gt_bboxes_3d (torch.Tensor): shape [box_num, 7], gt boxes
gt_bboxes_3d (:obj:BaseInstance3DBoxes): gt boxes containing tensor
of shape [box_num, 7].
gt_labels_3d (torch.Tensor): shape [box_num], class label of gt
Returns:
tuple : segmentation targets with shape [voxel_num]
part prediction targets with shape [voxel_num, 3]
"""
enlarged_gt_boxes = box_torch_ops.enlarge_box3d_lidar(
gt_bboxes_3d, extra_width=self.extra_width)
gt_bboxes_3d = gt_bboxes_3d.to(voxel_centers.device)
enlarged_gt_boxes = gt_bboxes_3d.enlarged_box(self.extra_width)
part_targets = voxel_centers.new_zeros((voxel_centers.shape[0], 3),
dtype=torch.float32)
box_idx = points_in_boxes_gpu(
voxel_centers.unsqueeze(0),
gt_bboxes_3d.unsqueeze(0)).squeeze(0) # -1 ~ box_num
enlarge_box_idx = points_in_boxes_gpu(
voxel_centers.unsqueeze(0),
enlarged_gt_boxes.unsqueeze(0)).squeeze(0).long() # -1 ~ box_num
box_idx = gt_bboxes_3d.points_in_boxes(voxel_centers)
enlarge_box_idx = enlarged_gt_boxes.points_in_boxes(
voxel_centers).long()
gt_labels_pad = F.pad(
gt_labels_3d, (1, 0), mode='constant', value=self.num_classes)
......@@ -95,24 +93,37 @@ class PointwiseSemanticHead(nn.Module):
ignore_flag = fg_pt_flag ^ (enlarge_box_idx > -1)
seg_targets[ignore_flag] = -1
for k in range(gt_bboxes_3d.shape[0]):
for k in range(len(gt_bboxes_3d)):
k_box_flag = box_idx == k
# no point in current box (caused by velodyne reduce)
if not k_box_flag.any():
continue
fg_voxels = voxel_centers[k_box_flag]
transformed_voxels = fg_voxels - gt_bboxes_3d[k, 0:3]
transformed_voxels = fg_voxels - gt_bboxes_3d.bottom_center[k]
transformed_voxels = box_torch_ops.rotation_3d_in_axis(
transformed_voxels.unsqueeze(0),
-gt_bboxes_3d[k, 6].view(1),
-gt_bboxes_3d.yaw[k].view(1),
axis=2)
part_targets[k_box_flag] = transformed_voxels / gt_bboxes_3d[
k, 3:6] + voxel_centers.new_tensor([0.5, 0.5, 0])
part_targets[k_box_flag] = transformed_voxels / gt_bboxes_3d.dims[
k] + voxel_centers.new_tensor([0.5, 0.5, 0])
part_targets = torch.clamp(part_targets, min=0)
return seg_targets, part_targets
def get_targets(self, voxels_dict, gt_bboxes_3d, gt_labels_3d):
"""generate segmentation and part prediction targets
Args:
voxels_dict (dict): The dict containing voxel-wise information,
including the voxel centers.
gt_bboxes_3d (list[:obj:BaseInstance3DBoxes]): list of gt boxes
containing tensor of shape [box_num, 7].
gt_labels_3d (list[torch.Tensor]): list of GT labels.
Returns:
tuple : segmentation targets with shape [voxel_num]
part prediction targets with shape [voxel_num, 3]
"""
batch_size = len(gt_labels_3d)
voxel_center_list = []
for idx in range(batch_size):
......
......@@ -8,7 +8,7 @@ from ..builder import build_head, build_roi_extractor
from .base_3droi_head import Base3DRoIHead
@HEADS.register_module
@HEADS.register_module()
class PartAggregationROIHead(Base3DRoIHead):
"""Part aggregation roi head for PartA2"""
......@@ -174,7 +174,7 @@ class PartAggregationROIHead(Base3DRoIHead):
cur_proposal_list = proposal_list[batch_idx]
cur_boxes = cur_proposal_list['boxes_3d']
cur_labels_3d = cur_proposal_list['labels_3d']
cur_gt_bboxes = gt_bboxes_3d[batch_idx]
cur_gt_bboxes = gt_bboxes_3d[batch_idx].to(cur_boxes.device)
cur_gt_labels = gt_labels_3d[batch_idx]
batch_num_gts = 0
......@@ -189,7 +189,7 @@ class PartAggregationROIHead(Base3DRoIHead):
pred_per_cls = (cur_labels_3d == i)
cur_assign_res = assigner.assign(
cur_boxes[pred_per_cls],
cur_gt_bboxes[gt_per_cls],
cur_gt_bboxes.tensor[gt_per_cls],
gt_labels=cur_gt_labels[gt_per_cls])
# gather assign_results in different class into one result
batch_num_gts += cur_assign_res.num_gts
......@@ -215,11 +215,11 @@ class PartAggregationROIHead(Base3DRoIHead):
batch_gt_labels)
else: # for single class
assign_result = self.bbox_assigner.assign(
cur_boxes, cur_gt_bboxes, gt_labels=cur_gt_labels)
cur_boxes, cur_gt_bboxes.tensor, gt_labels=cur_gt_labels)
# sample boxes
sampling_result = self.bbox_sampler.sample(assign_result,
cur_boxes,
cur_gt_bboxes,
cur_gt_bboxes.tensor,
cur_gt_labels)
sampling_results.append(sampling_result)
return sampling_results
......
from .pillar_encoder import AlignedPillarFeatureNet, PillarFeatureNet
from .voxel_encoder import (DynamicVFE, VoxelFeatureExtractor,
VoxelFeatureExtractorV2, VoxelFeatureExtractorV3)
from .pillar_encoder import PillarFeatureNet
from .voxel_encoder import DynamicSimpleVFE, DynamicVFE, HardSimpleVFE, HardVFE
__all__ = [
'PillarFeatureNet', 'AlignedPillarFeatureNet', 'VoxelFeatureExtractor',
'DynamicVFE', 'VoxelFeatureExtractorV2', 'VoxelFeatureExtractorV3'
'PillarFeatureNet', 'HardVFE', 'DynamicVFE', 'HardSimpleVFE',
'DynamicSimpleVFE'
]
......@@ -9,55 +9,54 @@ from .utils import PFNLayer, get_paddings_indicator
@VOXEL_ENCODERS.register_module()
class PillarFeatureNet(nn.Module):
"""Pillar Feature Net.
def __init__(self,
num_input_features=4,
use_norm=True,
num_filters=(64, ),
with_distance=False,
with_cluster_center=True,
with_voxel_center=True,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1),
mode='max'):
""" Pillar Feature Net.
The network prepares the pillar features and performs forward pass
through PFNLayers.
Args:
num_input_features (int). Number of input features,
in_channels (int). Number of input features,
either x, y, z or x, y, z, r.
use_norm (bool). Whether to include BatchNorm.
num_filters (list[int]). Number of features in each of the
feat_channels (list[int]). Number of features in each of the
N PFNLayers.
with_distance (bool). Whether to include Euclidean distance
to points.
voxel_size (list[float]). Size of voxels, only utilize x and y
size.
point_cloud_range (list[float>]). Point cloud range, only
utilize x and y min.
point_cloud_range (list[float]). Point cloud range, only
utilizes x and y min.
"""
def __init__(self,
in_channels=4,
feat_channels=(64, ),
with_distance=False,
with_cluster_center=True,
with_voxel_center=True,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
mode='max'):
super(PillarFeatureNet, self).__init__()
assert len(num_filters) > 0
assert len(feat_channels) > 0
if with_cluster_center:
num_input_features += 3
in_channels += 3
if with_voxel_center:
num_input_features += 2
in_channels += 2
if with_distance:
num_input_features += 1
in_channels += 1
self._with_distance = with_distance
self._with_cluster_center = with_cluster_center
self._with_voxel_center = with_voxel_center
# Create PillarFeatureNet layers
self.num_input_features = num_input_features
num_filters = [num_input_features] + list(num_filters)
self.in_channels = in_channels
feat_channels = [in_channels] + list(feat_channels)
pfn_layers = []
for i in range(len(num_filters) - 1):
in_filters = num_filters[i]
out_filters = num_filters[i + 1]
if i < len(num_filters) - 2:
for i in range(len(feat_channels) - 1):
in_filters = feat_channels[i]
out_filters = feat_channels[i + 1]
if i < len(feat_channels) - 2:
last_layer = False
else:
last_layer = True
......@@ -65,7 +64,7 @@ class PillarFeatureNet(nn.Module):
PFNLayer(
in_filters,
out_filters,
use_norm,
norm_cfg=norm_cfg,
last_layer=last_layer,
mode=mode))
self.pfn_layers = nn.ModuleList(pfn_layers)
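A minimal instantiation sketch with the renamed arguments (values are illustrative; the import path is assumed to follow the voxel_encoders package __init__ shown earlier in this diff):

from mmdet3d.models.voxel_encoders import PillarFeatureNet

pfn = PillarFeatureNet(
    in_channels=4,
    feat_channels=(64, ),
    with_distance=False,
    voxel_size=(0.2, 0.2, 4),
    point_cloud_range=(0, -40, -3, 70.4, 40, 1),
    norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
    mode='max')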
......@@ -122,9 +121,8 @@ class PillarFeatureNet(nn.Module):
class DynamicPillarFeatureNet(PillarFeatureNet):
def __init__(self,
num_input_features=4,
use_norm=True,
num_filters=(64, ),
in_channels=4,
feat_channels=(64, ),
with_distance=False,
with_cluster_center=True,
with_voxel_center=True,
......@@ -138,23 +136,23 @@ class DynamicPillarFeatureNet(PillarFeatureNet):
"""
super(DynamicPillarFeatureNet, self).__init__(
num_input_features,
use_norm,
num_filters,
in_channels,
feat_channels,
with_distance,
with_cluster_center=with_cluster_center,
with_voxel_center=with_voxel_center,
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
norm_cfg=norm_cfg,
mode=mode)
num_filters = [self.num_input_features] + list(num_filters)
feat_channels = [self.in_channels] + list(feat_channels)
pfn_layers = []
# TODO: currently only support one PFNLayer
for i in range(len(num_filters) - 1):
in_filters = num_filters[i]
out_filters = num_filters[i + 1]
for i in range(len(feat_channels) - 1):
in_filters = feat_channels[i]
out_filters = feat_channels[i + 1]
if i > 0:
in_filters *= 2
norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
......@@ -235,145 +233,3 @@ class DynamicPillarFeatureNet(PillarFeatureNet):
features = torch.cat([point_feats, feat_per_point], dim=1)
return voxel_feats, voxel_coors
@VOXEL_ENCODERS.register_module()
class AlignedPillarFeatureNet(nn.Module):
def __init__(self,
num_input_features=4,
use_norm=True,
num_filters=(64, ),
with_distance=False,
with_cluster_center=True,
with_voxel_center=True,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1),
mode='max'):
""" Pillar Feature Net.
The network prepares the pillar features and performs forward pass
through PFNLayers.
Args:
num_input_features (int): Number of input features, either x, y, z
or x, y, z, r.
use_norm (bool): Whether to include BatchNorm.
num_filters (list[int]): Number of features in each of the N
PFNLayers.
with_distance (bool): Whether to include Euclidean distance to
points.
voxel_size (list[float]): Size of voxels, only utilize x and y
size.
point_cloud_range: (list[float]): Point cloud range, only
utilize x and y min.
"""
super(AlignedPillarFeatureNet, self).__init__()
assert len(num_filters) > 0
if with_cluster_center:
print('Use cluster center')
num_input_features += 3
if with_voxel_center:
print('Use voxel center')
num_input_features += 2
if with_distance:
num_input_features += 1
self._with_distance = with_distance
self._with_cluster_center = with_cluster_center
self._with_voxel_center = with_voxel_center
# Create PillarFeatureNet layers
num_filters = [num_input_features] + list(num_filters)
pfn_layers = []
for i in range(len(num_filters) - 1):
in_filters = num_filters[i]
out_filters = num_filters[i + 1]
if i < len(num_filters) - 2:
last_layer = False
else:
last_layer = True
pfn_layers.append(
PFNLayer(
in_filters,
out_filters,
use_norm,
last_layer=last_layer,
mode=mode))
self.pfn_layers = nn.ModuleList(pfn_layers)
# Need pillar (voxel) size and x/y offset in order to
# calculate pillar offset
self.vx = voxel_size[0]
self.vy = voxel_size[1]
self.vz = voxel_size[2]
self.x_offset = self.vx / 2 + point_cloud_range[0]
self.y_offset = self.vy / 2 + point_cloud_range[1]
self.z_offset = self.vz / 2 + point_cloud_range[2]
def forward(self, features, num_points, coors):
features_ls = [features]
# Find distance of x, y, and z from cluster center
if self._with_cluster_center:
points_mean = features[:, :, :3].sum(
dim=1, keepdim=True) / num_points.type_as(features).view(
-1, 1, 1)
f_cluster = features[:, :, :3] - points_mean
features_ls.append(f_cluster)
x_distance = features[:, :, 0] - (
coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
self.x_offset)
y_distance = features[:, :, 1] - (
coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
self.y_offset)
z_distance = features[:, :, 2] - (
coors[:, 1].type_as(features).unsqueeze(1) * self.vz +
self.z_offset)
normed_x_distance = 1 - torch.abs(x_distance / self.vx)
normed_y_distance = 1 - torch.abs(y_distance / self.vy)
normed_z_distance = 1 - torch.abs(z_distance / self.vz)
x_mask = torch.gt(normed_x_distance, 0).type_as(features)
y_mask = torch.gt(normed_y_distance, 0).type_as(features)
z_mask = torch.gt(normed_z_distance, 0).type_as(features)
nonzero_points_mask = x_mask.mul(y_mask).mul(z_mask)
aligned_distance = normed_x_distance.mul(normed_y_distance).mul(
normed_z_distance).mul(nonzero_points_mask)
# Find distance of x, y, and z from pillar center
if self._with_voxel_center:
f_center = features[:, :, :2]
f_center[:, :, 0] = f_center[:, :, 0] - (
coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
self.x_offset)
f_center[:, :, 1] = f_center[:, :, 1] - (
coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
self.y_offset)
features_ls.append(f_center)
if self._with_distance:
points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
features_ls.append(points_dist)
# Combine together feature decorations
features = torch.cat(features_ls, dim=-1)
# The feature decorations were calculated without regard to
# whether pillar was empty. Need to ensure that
# empty pillars remain set to zeros.
voxel_count = features.shape[1]
mask = get_paddings_indicator(num_points, voxel_count, axis=0)
mask = torch.unsqueeze(mask, -1).type_as(features)
features *= mask
for pfn in self.pfn_layers:
if pfn.last_vfe:
features = pfn(features, aligned_distance)
else:
features = pfn(features)
return features.squeeze()
......@@ -4,28 +4,15 @@ from torch import nn
from torch.nn import functional as F
class Empty(nn.Module):
def __init__(self, *args, **kwargs):
super(Empty, self).__init__()
def forward(self, *args, **kwargs):
if len(args) == 1:
return args[0]
elif len(args) == 0:
return None
return args
def get_paddings_indicator(actual_num, max_num, axis=0):
"""Create boolean mask by actually number of a padded tensor.
Args:
actual_num ([type]): [description]
max_num ([type]): [description]
actual_num (torch.Tensor): Actual number of points in each voxel.
max_num (int): Max number of points in each voxel
Returns:
[type]: [description]
torch.Tensor: Mask indicates which points are valid inside a voxel.
"""
actual_num = torch.unsqueeze(actual_num, axis + 1)
# tiled_actual_num: [N, M, 1]
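An illustrative sketch of the mask this produces (the import path for the helper is assumed):

import torch
from mmdet3d.models.voxel_encoders.utils import get_paddings_indicator

# three voxels holding 2, 5 and 0 points, padded to max_num=5
actual_num = torch.tensor([2, 5, 0])
mask = get_paddings_indicator(actual_num, max_num=5)
# mask has shape (3, 5); row i is True for the first actual_num[i] slots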
......@@ -52,13 +39,9 @@ class VFELayer(nn.Module):
self.cat_max = cat_max
self.max_out = max_out
# self.units = int(out_channels / 2)
if norm_cfg:
norm_name, norm_layer = build_norm_layer(norm_cfg, out_channels)
self.norm = norm_layer
self.norm = build_norm_layer(norm_cfg, out_channels)[1]
self.linear = nn.Linear(in_channels, out_channels, bias=False)
else:
self.norm = Empty(out_channels)
self.linear = nn.Linear(in_channels, out_channels, bias=True)
def forward(self, inputs):
# [K, T, 7] tensordot [7, units] = [K, T, units]
......@@ -89,7 +72,7 @@ class PFNLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
use_norm=True,
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
last_layer=False,
mode='max'):
""" Pillar Feature Net Layer.
......@@ -100,9 +83,11 @@ class PFNLayer(nn.Module):
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
use_norm (bool): Whether to include BatchNorm.
norm_cfg (dict): Config dict of normalization layers
last_layer (bool): If last_layer, there is no concatenation of
features.
mode (str): Pooling mode to gather features inside voxels.
Defaults to 'max'.
"""
super().__init__()
......@@ -112,13 +97,10 @@ class PFNLayer(nn.Module):
out_channels = out_channels // 2
self.units = out_channels
if use_norm:
self.norm = nn.BatchNorm1d(self.units, eps=1e-3, momentum=0.01)
self.norm = build_norm_layer(norm_cfg, self.units)[1]
self.linear = nn.Linear(in_channels, self.units, bias=False)
else:
self.norm = Empty(self.unints)
self.linear = nn.Linear(in_channels, self.units, bias=True)
assert mode in ['max', 'avg']
self.mode = mode
def forward(self, inputs, num_voxels=None, aligned_distance=None):
......
import torch
from mmcv.cnn import build_norm_layer
from torch import nn
from torch.nn import functional as F
from mmdet3d.ops import DynamicScatter
from .. import builder
from ..registry import VOXEL_ENCODERS
from .utils import Empty, VFELayer, get_paddings_indicator
from .utils import VFELayer, get_paddings_indicator
@VOXEL_ENCODERS.register_module()
class VoxelFeatureExtractor(nn.Module):
class HardSimpleVFE(nn.Module):
"""Simple voxel feature encoder used in SECOND
def __init__(self,
num_input_features=4,
use_norm=True,
num_filters=[32, 128],
with_distance=False,
name='VoxelFeatureExtractor'):
super(VoxelFeatureExtractor, self).__init__()
self.name = name
assert len(num_filters) == 2
num_input_features += 3 # add mean features
if with_distance:
num_input_features += 1
self._with_distance = with_distance
self.vfe1 = VFELayer(num_input_features, num_filters[0], use_norm)
self.vfe2 = VFELayer(num_filters[0], num_filters[1], use_norm)
if use_norm:
self.linear = nn.Linear(num_filters[1], num_filters[1], bias=False)
self.norm = nn.BatchNorm1d(num_filters[1], eps=1e-3, momentum=0.01)
else:
self.linear = nn.Linear(num_filters[1], num_filters[1], bias=True)
self.norm = Empty(num_filters[1])
def forward(self, features, num_voxels, **kwargs):
# features: [concated_num_points, num_voxel_size, 3(4)]
# num_voxels: [concated_num_points]
# t = time.time()
# torch.cuda.synchronize()
points_mean = features[:, :, :3].sum(
dim=1, keepdim=True) / num_voxels.type_as(features).view(-1, 1, 1)
features_relative = features[:, :, :3] - points_mean
if self._with_distance:
points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
features = torch.cat([features, features_relative, points_dist],
dim=-1)
else:
features = torch.cat([features, features_relative], dim=-1)
voxel_count = features.shape[1]
mask = get_paddings_indicator(num_voxels, voxel_count, axis=0)
mask = torch.unsqueeze(mask, -1).type_as(features)
# mask = features.max(dim=2, keepdim=True)[0] != 0
# torch.cuda.synchronize()
# print("vfe prep forward time", time.time() - t)
x = self.vfe1(features)
x *= mask
x = self.vfe2(x)
x *= mask
x = self.linear(x)
x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2,
1).contiguous()
x = F.relu(x)
x *= mask
# x: [concated_num_points, num_voxel_size, 128]
voxelwise = torch.max(x, dim=1)[0]
return voxelwise
@VOXEL_ENCODERS.register_module()
class VoxelFeatureExtractorV2(nn.Module):
def __init__(self,
num_input_features=4,
use_norm=True,
num_filters=[32, 128],
with_distance=False,
name='VoxelFeatureExtractor'):
super(VoxelFeatureExtractorV2, self).__init__()
self.name = name
assert len(num_filters) > 0
num_input_features += 3
if with_distance:
num_input_features += 1
self._with_distance = with_distance
num_filters = [num_input_features] + num_filters
filters_pairs = [[num_filters[i], num_filters[i + 1]]
for i in range(len(num_filters) - 1)]
self.vfe_layers = nn.ModuleList(
[VFELayer(i, o, use_norm) for i, o in filters_pairs])
if use_norm:
self.linear = nn.Linear(
num_filters[-1], num_filters[-1], bias=False)
self.norm = nn.BatchNorm1d(
num_filters[-1], eps=1e-3, momentum=0.01)
else:
self.linear = nn.Linear(
num_filters[-1], num_filters[-1], bias=True)
self.norm = Empty(num_filters[-1])
def forward(self, features, num_voxels, **kwargs):
# features: [concated_num_points, num_voxel_size, 3(4)]
# num_voxels: [concated_num_points]
points_mean = features[:, :, :3].sum(
dim=1, keepdim=True) / num_voxels.type_as(features).view(-1, 1, 1)
features_relative = features[:, :, :3] - points_mean
if self._with_distance:
points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
features = torch.cat([features, features_relative, points_dist],
dim=-1)
else:
features = torch.cat([features, features_relative], dim=-1)
voxel_count = features.shape[1]
mask = get_paddings_indicator(num_voxels, voxel_count, axis=0)
mask = torch.unsqueeze(mask, -1).type_as(features)
for vfe in self.vfe_layers:
features = vfe(features)
features *= mask
features = self.linear(features)
features = self.norm(features.permute(0, 2, 1).contiguous()).permute(
0, 2, 1).contiguous()
features = F.relu(features)
features *= mask
# x: [concated_num_points, num_voxel_size, 128]
voxelwise = torch.max(features, dim=1)[0]
return voxelwise
@VOXEL_ENCODERS.register_module()
class VoxelFeatureExtractorV3(nn.Module):
It simply averages the values of points in a voxel.
"""
def __init__(self,
num_input_features=4,
use_norm=True,
num_filters=[32, 128],
with_distance=False,
name='VoxelFeatureExtractor'):
super(VoxelFeatureExtractorV3, self).__init__()
self.name = name
def __init__(self):
super(HardSimpleVFE, self).__init__()
def forward(self, features, num_points, coors):
# features: [concated_num_points, num_voxel_size, 3(4)]
......@@ -153,13 +27,21 @@ class VoxelFeatureExtractorV3(nn.Module):
@VOXEL_ENCODERS.register_module()
class DynamicVFEV3(nn.Module):
class DynamicSimpleVFE(nn.Module):
"""Simple dynamic voxel feature encoder used in DV-SECOND
It simply averages the values of points in a voxel.
But the number of points in a voxel is dynamic and varies.
Args:
voxel_size (tuple[float]): Size of a single voxel
point_cloud_range (tuple[float]): Range of the point cloud and voxels
"""
def __init__(self,
num_input_features=4,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1)):
super(DynamicVFEV3, self).__init__()
super(DynamicSimpleVFE, self).__init__()
self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)
@torch.no_grad()
......@@ -172,10 +54,37 @@ class DynamicVFEV3(nn.Module):
@VOXEL_ENCODERS.register_module()
class DynamicVFE(nn.Module):
"""Dynamic Voxel feature encoder used in DV-SECOND
It encodes features of voxels and their points. It could also fuse
image feature into voxel features in a point-wise manner.
The number of points inside the voxel varies.
Args:
in_channels (int): Input channels of VFE. Defaults to 4.
feat_channels (list(int)): Channels of features in VFE.
with_distance (bool): Whether to use the L2 distance of points to the
origin point. Default False.
with_cluster_center (bool): Whether to use the distance to cluster
center of points inside a voxel. Default to False.
with_voxel_center (bool): Whether to use the distance to the center of
the voxel for each point inside a voxel. Defaults to False.
voxel_size (tuple[float]): Size of a single voxel. Default to
(0.2, 0.2, 4).
point_cloud_range (tuple[float]): The range of points or voxels.
Default to (0, -40, -3, 70.4, 40, 1).
norm_cfg (dict): Config dict of normalization layers.
mode (str): The mode when pooling features of points inside a voxel.
Available options include 'max' and 'avg'. Default to 'max'.
fusion_layer (dict | None): The config dict of fusion layer used in
multi-modal detectors. Default to None.
return_point_feats (bool): Whether to return the features of each
point. Defaults to False.
"""
def __init__(self,
num_input_features=4,
num_filters=[],
in_channels=4,
feat_channels=[],
with_distance=False,
with_cluster_center=False,
with_voxel_center=False,
......@@ -186,14 +95,15 @@ class DynamicVFE(nn.Module):
fusion_layer=None,
return_point_feats=False):
super(DynamicVFE, self).__init__()
assert len(num_filters) > 0
assert mode in ['avg', 'max']
assert len(feat_channels) > 0
if with_cluster_center:
num_input_features += 3
in_channels += 3
if with_voxel_center:
num_input_features += 3
in_channels += 3
if with_distance:
num_input_features += 3
self.num_input_features = num_input_features
in_channels += 3
self.in_channels = in_channels
self._with_distance = with_distance
self._with_cluster_center = with_cluster_center
self._with_voxel_center = with_voxel_center
......@@ -209,11 +119,11 @@ class DynamicVFE(nn.Module):
self.point_cloud_range = point_cloud_range
self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)
num_filters = [self.num_input_features] + list(num_filters)
feat_channels = [self.in_channels] + list(feat_channels)
vfe_layers = []
for i in range(len(num_filters) - 1):
in_filters = num_filters[i]
out_filters = num_filters[i + 1]
for i in range(len(feat_channels) - 1):
in_filters = feat_channels[i]
out_filters = feat_channels[i + 1]
if i > 0:
in_filters *= 2
norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
......@@ -232,6 +142,16 @@ class DynamicVFE(nn.Module):
self.fusion_layer = builder.build_fusion_layer(fusion_layer)
def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors):
"""Map voxel features to its corresponding points.
Args:
pts_coors (torch.Tensor): Voxel coordinate of each point.
voxel_mean (torch.Tensor): Voxel features to be mapped.
voxel_coors (torch.Tensor): Coordinates of valid voxels
Returns:
torch.Tensor: Features or centers of each point.
"""
# Step 1: scatter voxel into canvas
# Calculate necessary things for canvas creation
canvas_z = int(
......@@ -269,9 +189,21 @@ class DynamicVFE(nn.Module):
points=None,
img_feats=None,
img_meta=None):
"""
features (torch.Tensor): NxC
coors (torch.Tensor): Nx(1+NDim)
"""Forward functions
Args:
features (torch.Tensor): Features of voxels, shape is NxC.
coors (torch.Tensor): Coordinates of voxels, shape is Nx(1+NDim).
points (list[torch.Tensor], optional): Raw points used to guide the
multi-modality fusion. Defaults to None.
img_feats (list[torch.Tensor], optional): Image features used for
multi-modality fusion. Defaults to None.
img_meta (dict, optional): Meta information of the image. Defaults to None.
Returns:
tuple: If `return_point_feats` is False, returns voxel features and
their coordinates. If `return_point_feats` is True, returns the
features of each point inside the voxels.
"""
features_ls = [features]
# Find distance of x, y, and z from cluster center
......@@ -320,10 +252,36 @@ class DynamicVFE(nn.Module):
@VOXEL_ENCODERS.register_module()
class HardVFE(nn.Module):
"""Voxel feature encoder used in DV-SECOND
It encodes features of voxels and their points. It could also fuse
image feature into voxel features in a point-wise manner.
Args:
in_channels (int): Input channels of VFE. Defaults to 4.
feat_channels (list(int)): Channels of features in VFE.
with_distance (bool): Whether to use the L2 distance of points to the
origin point. Default False.
with_cluster_center (bool): Whether to use the distance to cluster
center of points inside a voxel. Default to False.
with_voxel_center (bool): Whether to use the distance to the center of
the voxel for each point inside a voxel. Defaults to False.
voxel_size (tuple[float]): Size of a single voxel. Default to
(0.2, 0.2, 4).
point_cloud_range (tuple[float]): The range of points or voxels.
Default to (0, -40, -3, 70.4, 40, 1).
norm_cfg (dict): Config dict of normalization layers.
mode (str): The mode when pooling features of points inside a voxel.
Available options include 'max' and 'avg'. Default to 'max'.
fusion_layer (dict | None): The config dict of fusion layer used in
multi-modal detectors. Default to None.
return_point_feats (bool): Whether to return the features of each
point. Defaults to False.
"""
def __init__(self,
num_input_features=4,
num_filters=[],
in_channels=4,
feat_channels=[],
with_distance=False,
with_cluster_center=False,
with_voxel_center=False,
......@@ -334,14 +292,14 @@ class HardVFE(nn.Module):
fusion_layer=None,
return_point_feats=False):
super(HardVFE, self).__init__()
assert len(num_filters) > 0
assert len(feat_channels) > 0
if with_cluster_center:
num_input_features += 3
in_channels += 3
if with_voxel_center:
num_input_features += 3
in_channels += 3
if with_distance:
num_input_features += 3
self.num_input_features = num_input_features
in_channels += 3
self.in_channels = in_channels
self._with_distance = with_distance
self._with_cluster_center = with_cluster_center
self._with_voxel_center = with_voxel_center
......@@ -357,16 +315,16 @@ class HardVFE(nn.Module):
self.point_cloud_range = point_cloud_range
self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)
num_filters = [self.num_input_features] + list(num_filters)
feat_channels = [self.in_channels] + list(feat_channels)
vfe_layers = []
for i in range(len(num_filters) - 1):
in_filters = num_filters[i]
out_filters = num_filters[i + 1]
for i in range(len(feat_channels) - 1):
in_filters = feat_channels[i]
out_filters = feat_channels[i + 1]
if i > 0:
in_filters *= 2
# TODO: pass norm_cfg to VFE
# norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
if i == (len(num_filters) - 2):
if i == (len(feat_channels) - 2):
cat_max = False
max_out = True
if fusion_layer:
......@@ -394,9 +352,20 @@ class HardVFE(nn.Module):
coors,
img_feats=None,
img_meta=None):
"""
features (torch.Tensor): NxMxC
coors (torch.Tensor): Nx(1+NDim)
"""Forward functions
Args:
features (torch.Tensor): Features of voxels, shape is MxNxC.
num_points (torch.Tensor): Number of points in each voxel.
coors (torch.Tensor): Coordinates of voxels, shape is Mx(1+NDim).
img_feats (list[torch.Tensor], optional): Image features used for
multi-modality fusion. Defaults to None.
img_meta (dict, optional): Meta information of the image. Defaults to None.
Returns:
tuple: If `return_point_feats` is False, returns voxel features and
their coordinates. If `return_point_feats` is True, returns the
features of each point inside the voxels.
"""
features_ls = [features]
# Find distance of x, y, and z from cluster center
......@@ -438,19 +407,29 @@ class HardVFE(nn.Module):
for i, vfe in enumerate(self.vfe_layers):
voxel_feats = vfe(voxel_feats)
if torch.isnan(voxel_feats).any():
import pdb
pdb.set_trace()
if (self.fusion_layer is not None and img_feats is not None):
voxel_feats = self.fusion_with_mask(features, mask, voxel_feats,
coors, img_feats, img_meta)
if torch.isnan(voxel_feats).any():
import pdb
pdb.set_trace()
return voxel_feats
def fusion_with_mask(self, features, mask, voxel_feats, coors, img_feats,
img_meta):
"""Fuse image and point features with mask.
Args:
features (torch.Tensor): Features of voxel, usually it is the
values of points in voxels.
mask (torch.Tensor): Mask indicates valid features in each voxel.
voxel_feats (torch.Tensor): Features of voxels.
coors (torch.Tensor): Coordinates of each single voxel.
img_feats (list[torch.Tensor]): Multi-scale feature maps of image.
img_meta (list(dict)): Meta information of image and points.
Returns:
torch.Tensor: Fused features of each voxel.
"""
# the features consist of a batch of points
batch_size = coors[-1, 0] + 1
points = []
......@@ -459,20 +438,13 @@ class HardVFE(nn.Module):
points.append(features[single_mask][mask[single_mask]])
point_feats = voxel_feats[mask]
if torch.isnan(point_feats).any():
import pdb
pdb.set_trace()
point_feats = self.fusion_layer(img_feats, points, point_feats,
img_meta)
if torch.isnan(point_feats).any():
import pdb
pdb.set_trace()
voxel_canvas = voxel_feats.new_zeros(
size=(voxel_feats.size(0), voxel_feats.size(1),
point_feats.size(-1)))
voxel_canvas[mask] = point_feats
out = torch.max(voxel_canvas, dim=1)[0]
if torch.isnan(out).any():
import pdb
pdb.set_trace()
return out
......@@ -3,7 +3,7 @@ import pytest
import torch
from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes,
LiDARInstance3DBoxes)
DepthInstance3DBoxes, LiDARInstance3DBoxes)
def test_lidar_boxes3d():
......@@ -13,6 +13,46 @@ def test_lidar_boxes3d():
assert boxes.tensor.shape[0] == 0
assert boxes.tensor.shape[1] == 7
# Test init with origin
gravity_center_box = np.array(
[[
-5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 2.06200000e+00,
4.40900000e+00, 1.54800000e+00, -1.48801203e+00
],
[
-2.66751588e+01, 5.59499564e+00, -9.14345860e-01, 3.43000000e-01,
4.58000000e-01, 7.82000000e-01, -4.62759755e+00
],
[
-5.80979675e+00, 3.54092357e+01, 2.00889888e-01, 2.39600000e+00,
3.96900000e+00, 1.73200000e+00, -4.65203216e+00
],
[
-3.13086877e+01, 1.09007628e+00, -1.94612112e-01, 1.94400000e+00,
3.85700000e+00, 1.72300000e+00, -2.81427027e+00
]],
dtype=np.float32)
bottom_center_box = LiDARInstance3DBoxes(
gravity_center_box, origin=[0.5, 0.5, 0.5])
expected_tensor = torch.tensor(
[[
-5.24223238e+00, 4.00209696e+01, -4.76429619e-01, 2.06200000e+00,
4.40900000e+00, 1.54800000e+00, -1.48801203e+00
],
[
-2.66751588e+01, 5.59499564e+00, -1.30534586e+00, 3.43000000e-01,
4.58000000e-01, 7.82000000e-01, -4.62759755e+00
],
[
-5.80979675e+00, 3.54092357e+01, -6.65110112e-01, 2.39600000e+00,
3.96900000e+00, 1.73200000e+00, -4.65203216e+00
],
[
-3.13086877e+01, 1.09007628e+00, -1.05611211e+00, 1.94400000e+00,
3.85700000e+00, 1.72300000e+00, -2.81427027e+00
]])
assert torch.allclose(expected_tensor, bottom_center_box.tensor)
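# The conversion above only shifts the origin: with origin=[0.5, 0.5, 0.5]
# the given z is the gravity center, and the stored bottom-center z is
# z_bottom = z_gravity - z_size / 2 (e.g. 2.9757e-01 - 1.548 / 2 =
# -4.7643e-01 for the first box); x, y and the box dimensions are unchanged.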
# Test init with numpy array
np_boxes = np.array(
[[1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.48],
......@@ -70,9 +110,19 @@ def test_lidar_boxes3d():
[28.2967, 0.5557558, -1.303325, 1.47, 2.23, 1.48, 4.7115927],
[26.66902, -21.82302, -1.736057, 1.56, 3.48, 1.4, 4.8315926],
[31.31978, -8.162144, -1.6217787, 1.74, 3.77, 1.48, 0.35159278]])
boxes.flip('horizontal')
assert torch.allclose(boxes.tensor, expected_tensor)
expected_tensor = torch.tensor(
[[-1.7802, -2.5162, -1.7501, 1.7500, 3.3900, 1.6500, -1.6616],
[-8.9594, -2.4567, -1.6357, 1.5400, 4.0100, 1.5700, -1.5216],
[-28.2967, 0.5558, -1.3033, 1.4700, 2.2300, 1.4800, -4.7116],
[-26.6690, -21.8230, -1.7361, 1.5600, 3.4800, 1.4000, -4.8316],
[-31.3198, -8.1621, -1.6218, 1.7400, 3.7700, 1.4800, -0.3516]])
boxes_flip_vert = boxes.clone()
boxes_flip_vert.flip('vertical')
assert torch.allclose(boxes_flip_vert.tensor, expected_tensor, 1e-4)
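# The yaw values above are consistent with the BEV flip conventions of the
# box classes: a horizontal flip maps yaw to pi - yaw (1.48 -> ~1.6616 for
# the first box), while a vertical flip simply negates it (1.6616 -> -1.6616).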
# test box rotation
expected_tensor = torch.tensor(
[[1.0385344, -2.9020846, -1.7501148, 1.75, 3.39, 1.65, 1.9336663],
......@@ -223,7 +273,7 @@ def test_lidar_boxes3d():
[27.3398, -18.3976, 29.0896, -14.6065]])
# the pytorch print loses some precision
assert torch.allclose(
boxes.nearest_bev, expected_tensor, rtol=1e-4, atol=1e-7)
# obtained by the print of the original implementation
expected_tensor = torch.tensor([[[2.4093e+00, -4.4784e+00, -1.9169e+00],
......@@ -269,6 +319,25 @@ def test_lidar_boxes3d():
# the pytorch print loses some precision
assert torch.allclose(boxes.corners, expected_tensor, rtol=1e-4, atol=1e-7)
# test new_box
new_box1 = boxes.new_box([[1, 2, 3, 4, 5, 6, 7]])
assert torch.allclose(
new_box1.tensor,
torch.tensor([[1, 2, 3, 4, 5, 6, 7]], dtype=boxes.tensor.dtype))
assert new_box1.device == boxes.device
assert new_box1.with_yaw == boxes.with_yaw
assert new_box1.box_dim == boxes.box_dim
new_box2 = boxes.new_box(np.array([[1, 2, 3, 4, 5, 6, 7]]))
assert torch.allclose(
new_box2.tensor,
torch.tensor([[1, 2, 3, 4, 5, 6, 7]], dtype=boxes.tensor.dtype))
new_box3 = boxes.new_box(torch.tensor([[1, 2, 3, 4, 5, 6, 7]]))
assert torch.allclose(
new_box3.tensor,
torch.tensor([[1, 2, 3, 4, 5, 6, 7]], dtype=boxes.tensor.dtype))
def test_boxes_conversion():
"""Test the conversion of boxes between different modes.
......@@ -284,6 +353,8 @@ def test_boxes_conversion():
[31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48, 2.79]])
cam_box_tensor = Box3DMode.convert(lidar_boxes.tensor, Box3DMode.LIDAR,
Box3DMode.CAM)
expected_box = lidar_boxes.convert_to(Box3DMode.CAM)
assert torch.equal(expected_box.tensor, cam_box_tensor)
# Some properties should be the same
cam_boxes = CameraInstance3DBoxes(cam_box_tensor)
......@@ -310,16 +381,10 @@ def test_boxes_conversion():
Box3DMode.DEPTH, Box3DMode.CAM)
assert torch.allclose(cam_box_tensor, depth_to_cam_box_tensor)
# test similar mode conversion
same_results = Box3DMode.convert(depth_box_tensor, Box3DMode.DEPTH,
Box3DMode.DEPTH)
assert torch.equal(same_results, depth_box_tensor)
# test conversion with a given rt_mat
camera_boxes = CameraInstance3DBoxes(
......@@ -389,6 +454,35 @@ def test_boxes_conversion():
rt_mat.inverse().numpy())
assert np.allclose(np.array(cam_to_lidar_box), expected_tensor[0].numpy())
# test convert from depth to lidar
depth_boxes = torch.tensor(
[[2.4593, 2.5870, -0.4321, 0.8597, 0.6193, 1.0204, 3.0693],
[1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 3.0601]],
dtype=torch.float32)
depth_boxes = DepthInstance3DBoxes(depth_boxes)
depth_to_lidar_box = depth_boxes.convert_to(Box3DMode.LIDAR)
expected_box = depth_to_lidar_box.convert_to(Box3DMode.DEPTH)
assert torch.equal(depth_boxes.tensor, expected_box.tensor)
lidar_to_depth_box = Box3DMode.convert(depth_to_lidar_box, Box3DMode.LIDAR,
Box3DMode.DEPTH)
assert torch.allclose(depth_boxes.tensor, lidar_to_depth_box.tensor)
assert torch.allclose(depth_boxes.volume, lidar_to_depth_box.volume)
# test convert from depth to camera
depth_to_cam_box = Box3DMode.convert(depth_boxes, Box3DMode.DEPTH,
Box3DMode.CAM)
cam_to_depth_box = Box3DMode.convert(depth_to_cam_box, Box3DMode.CAM,
Box3DMode.DEPTH)
expected_tensor = depth_to_cam_box.convert_to(Box3DMode.DEPTH)
assert torch.equal(expected_tensor.tensor, cam_to_depth_box.tensor)
assert torch.allclose(depth_boxes.tensor, cam_to_depth_box.tensor)
assert torch.allclose(depth_boxes.volume, cam_to_depth_box.volume)
with pytest.raises(NotImplementedError):
# assert invalid convert mode
Box3DMode.convert(depth_boxes, Box3DMode.DEPTH, 3)
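# Taken together, these checks assert that mode conversions round-trip:
# converting DEPTH -> LIDAR -> DEPTH or DEPTH -> CAM -> DEPTH (via either
# Box3DMode.convert or convert_to) reproduces the original tensor and volume,
# while converting to an unknown mode raises NotImplementedError.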
def test_camera_boxes3d():
# Test init with numpy array
......@@ -449,9 +543,19 @@ def test_camera_boxes3d():
[26.66902, -21.82302, -1.736057, 1.56, 3.48, 1.4, 4.8315926],
[31.31978, -8.162144, -1.6217787, 1.74, 3.77, 1.48, 0.35159278]]),
Box3DMode.LIDAR, Box3DMode.CAM)
boxes.flip('horizontal')
assert torch.allclose(boxes.tensor, expected_tensor)
expected_tensor = torch.tensor(
[[2.5162, 1.7501, -1.7802, 3.3900, 1.6500, 1.7500, -1.6616],
[2.4567, 1.6357, -8.9594, 4.0100, 1.5700, 1.5400, -1.5216],
[-0.5558, 1.3033, -28.2967, 2.2300, 1.4800, 1.4700, -4.7116],
[21.8230, 1.7361, -26.6690, 3.4800, 1.4000, 1.5600, -4.8316],
[8.1621, 1.6218, -31.3198, 3.7700, 1.4800, 1.7400, -0.3516]])
boxes_flip_vert = boxes.clone()
boxes_flip_vert.flip('vertical')
assert torch.allclose(boxes_flip_vert.tensor, expected_tensor, 1e-4)
# test box rotation
expected_tensor = Box3DMode.convert(
torch.tensor(
......@@ -560,7 +664,7 @@ def test_camera_boxes3d():
expected_tensor[:, 1::2] = lidar_expected_tensor[:, 0::2]
# the pytorch print loses some precision
assert torch.allclose(
boxes.nearest_bev, expected_tensor, rtol=1e-4, atol=1e-7)
# obtained by the print of the original implementation
expected_tensor = torch.tensor([[[3.2684e+00, 2.5769e-01, -7.7767e-01],
......@@ -659,3 +763,130 @@ def test_boxes3d_overlaps():
cam_boxes1.overlaps(cam_boxes1, boxes1)
with pytest.raises(AssertionError):
boxes1.overlaps(cam_boxes1, boxes1)
def test_depth_boxes3d():
# test empty initialization
empty_boxes = []
boxes = DepthInstance3DBoxes(empty_boxes)
assert boxes.tensor.shape[0] == 0
assert boxes.tensor.shape[1] == 7
# Test init with numpy array
np_boxes = np.array(
[[1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 3.0601],
[2.3262, 3.3065, 0.44255, 0.8234, 0.5325, 1.0099, 2.9971]],
dtype=np.float32)
boxes_1 = DepthInstance3DBoxes(np_boxes)
assert torch.allclose(boxes_1.tensor, torch.from_numpy(np_boxes))
# test properties
assert boxes_1.volume.size(0) == 2
assert (boxes_1.center == boxes_1.bottom_center).all()
expected_tensor = torch.tensor([[1.4856, 2.5299, -0.1093],
[2.3262, 3.3065, 0.9475]])
assert torch.allclose(boxes_1.gravity_center, expected_tensor)
expected_tensor = torch.tensor([[1.4856, 2.5299, 0.9385, 2.1404, 3.0601],
[2.3262, 3.3065, 0.8234, 0.5325, 2.9971]])
assert torch.allclose(boxes_1.bev, expected_tensor)
expected_tensor = torch.tensor([[1.0164, 1.4597, 1.9548, 3.6001],
[1.9145, 3.0402, 2.7379, 3.5728]])
assert torch.allclose(boxes_1.nearest_bev, expected_tensor, 1e-4)
assert repr(boxes) == (
'DepthInstance3DBoxes(\n tensor([], size=(0, 7)))')
# test init with torch.Tensor
th_boxes = torch.tensor(
[[2.4593, 2.5870, -0.4321, 0.8597, 0.6193, 1.0204, 3.0693],
[1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 3.0601]],
dtype=torch.float32)
boxes_2 = DepthInstance3DBoxes(th_boxes)
assert torch.allclose(boxes_2.tensor, th_boxes)
# test clone/to/device
boxes_2 = boxes_2.clone()
boxes_1 = boxes_1.to(boxes_2.device)
# test box concatenation
expected_tensor = torch.tensor(
[[1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 3.0601],
[2.3262, 3.3065, 0.44255, 0.8234, 0.5325, 1.0099, 2.9971],
[2.4593, 2.5870, -0.4321, 0.8597, 0.6193, 1.0204, 3.0693],
[1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 3.0601]])
boxes = DepthInstance3DBoxes.cat([boxes_1, boxes_2])
assert torch.allclose(boxes.tensor, expected_tensor)
# concatenate empty list
empty_boxes = DepthInstance3DBoxes.cat([])
assert empty_boxes.tensor.shape[0] == 0
assert empty_boxes.tensor.shape[-1] == 7
# test box flip
expected_tensor = torch.tensor(
[[-1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 0.0815],
[-2.3262, 3.3065, 0.4426, 0.8234, 0.5325, 1.0099, 0.1445],
[-2.4593, 2.5870, -0.4321, 0.8597, 0.6193, 1.0204, 0.0723],
[-1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 0.0815]])
boxes.flip(bev_direction='horizontal')
assert torch.allclose(boxes.tensor, expected_tensor, 1e-3)
expected_tensor = torch.tensor(
[[-1.4856, -2.5299, -0.5570, 0.9385, 2.1404, 0.8954, -0.0815],
[-2.3262, -3.3065, 0.4426, 0.8234, 0.5325, 1.0099, -0.1445],
[-2.4593, -2.5870, -0.4321, 0.8597, 0.6193, 1.0204, -0.0723],
[-1.4856, -2.5299, -0.5570, 0.9385, 2.1404, 0.8954, -0.0815]])
boxes.flip(bev_direction='vertical')
assert torch.allclose(boxes.tensor, expected_tensor, 1e-3)
# test box rotation
boxes_rot = boxes.clone()
expected_tensor = torch.tensor(
[[-1.6004, -2.4589, -0.5570, 0.9385, 2.1404, 0.8954, -0.0355],
[-2.4758, -3.1960, 0.4426, 0.8234, 0.5325, 1.0099, -0.0985],
[-2.5757, -2.4712, -0.4321, 0.8597, 0.6193, 1.0204, -0.0263],
[-1.6004, -2.4589, -0.5570, 0.9385, 2.1404, 0.8954, -0.0355]])
boxes_rot.rotate(-0.04599790655000615)
assert torch.allclose(boxes_rot.tensor, expected_tensor, 1e-3)
th_boxes = torch.tensor(
[[0.61211395, 0.8129094, 0.10563634, 1.497534, 0.16927195, 0.27956772],
[1.430009, 0.49797538, 0.9382923, 0.07694054, 0.9312509, 1.8919173]],
dtype=torch.float32)
boxes = DepthInstance3DBoxes(th_boxes, box_dim=6, with_yaw=False)
expected_tensor = torch.tensor([[
0.64884546, 0.78390356, 0.10563634, 1.50373348, 0.23795205, 0.27956772,
0
],
[
1.45139421, 0.43169443, 0.93829232,
0.11967964, 0.93380373, 1.89191735, 0
]])
boxes_3 = boxes.clone()
boxes_3.rotate(-0.04599790655000615)
assert torch.allclose(boxes_3.tensor, expected_tensor)
boxes.rotate(torch.tensor(-0.04599790655000615))
assert torch.allclose(boxes.tensor, expected_tensor)
# test bbox in_range_bev
expected_tensor = torch.tensor([1, 1], dtype=torch.bool)
mask = boxes.in_range_bev([0., -40., 70.4, 40.])
assert (mask == expected_tensor).all()
mask = boxes.nonempty()
assert (mask == expected_tensor).all()
expected_tensor = torch.tensor([[[-0.1030, 0.6649, 0.1056],
[-0.1030, 0.6649, 0.3852],
[-0.1030, 0.9029, 0.3852],
[-0.1030, 0.9029, 0.1056],
[1.4007, 0.6649, 0.1056],
[1.4007, 0.6649, 0.3852],
[1.4007, 0.9029, 0.3852],
[1.4007, 0.9029, 0.1056]],
[[1.3916, -0.0352, 0.9383],
[1.3916, -0.0352, 2.8302],
[1.3916, 0.8986, 2.8302],
[1.3916, 0.8986, 0.9383],
[1.5112, -0.0352, 0.9383],
[1.5112, -0.0352, 2.8302],
[1.5112, 0.8986, 2.8302],
[1.5112, 0.8986, 0.9383]]])
assert torch.allclose(boxes.corners, expected_tensor, 1e-3)
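# boxes here holds the two 6-DoF boxes defined above (box_dim=6,
# with_yaw=False), so boxes.corners has shape (2, 8, 3): eight xyz corners
# per box, matching the expected tensor.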
import numpy as np
import torch
from mmdet3d.core.bbox import LiDARInstance3DBoxes
from mmdet3d.datasets.pipelines import Compose
def test_outdoor_aug_pipeline():
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
class_names = ['Car']
np.random.seed(0)
train_pipeline = [
dict(type='LoadPointsFromFile', load_dim=4, use_dim=4),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='ObjectNoise',
num_try=100,
loc_noise_std=[1.0, 1.0, 0.5],
global_rot_range=[0.0, 0.0],
rot_uniform_noise=[-0.78539816, 0.78539816]),
dict(type='RandomFlip3D', flip_ratio=0.5),
dict(
type='GlobalRotScale',
rot_uniform_noise=[-0.78539816, 0.78539816],
scaling_uniform_noise=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
pipeline = Compose(train_pipeline)
gt_bboxes_3d = LiDARInstance3DBoxes(
torch.tensor([
[
2.16902428e+01, -4.06038128e-02, -1.61906636e+00,
1.65999997e+00, 3.20000005e+00, 1.61000001e+00, -1.53999996e+00
],
[
7.05006886e+00, -6.57459593e+00, -1.60107934e+00,
2.27999997e+00, 1.27799997e+01, 3.66000009e+00, 1.54999995e+00
],
[
2.24698811e+01, -6.69203758e+00, -1.50118136e+00,
2.31999993e+00, 1.47299995e+01, 3.64000010e+00, 1.59000003e+00
],
[
3.48291969e+01, -7.09058380e+00, -1.36622977e+00,
2.31999993e+00, 1.00400000e+01, 3.60999990e+00, 1.61000001e+00
],
[
4.62394600e+01, -7.75838804e+00, -1.32405007e+00,
2.33999991e+00, 1.28299999e+01, 3.63000011e+00, 1.63999999e+00
],
[
2.82966995e+01, -5.55755794e-01, -1.30332506e+00,
1.47000003e+00, 2.23000002e+00, 1.48000002e+00, -1.57000005e+00
],
[
2.66690197e+01, 2.18230209e+01, -1.73605704e+00,
1.55999994e+00, 3.48000002e+00, 1.39999998e+00, -1.69000006e+00
],
[
3.13197803e+01, 8.16214371e+00, -1.62177873e+00,
1.74000001e+00, 3.76999998e+00, 1.48000002e+00, 2.78999996e+00
],
[
4.34395561e+01, -1.95209332e+01, -1.20757008e+00,
1.69000006e+00, 4.09999990e+00, 1.40999997e+00, -1.53999996e+00
],
[
3.29882965e+01, -3.79360509e+00, -1.69245458e+00,
1.74000001e+00, 4.09000015e+00, 1.49000001e+00, -1.52999997e+00
],
[
3.85469360e+01, 8.35060215e+00, -1.31423414e+00,
1.59000003e+00, 4.28000021e+00, 1.45000005e+00, 1.73000002e+00
],
[
2.22492104e+01, -1.13536005e+01, -1.38272512e+00,
1.62000000e+00, 3.55999994e+00, 1.71000004e+00, 2.48000002e+00
],
[
3.36115799e+01, -1.97708054e+01, -4.92827654e-01,
1.64999998e+00, 3.54999995e+00, 1.79999995e+00, -1.57000005e+00
],
[
9.85029602e+00, -1.51294518e+00, -1.66834795e+00,
1.59000003e+00, 3.17000008e+00, 1.38999999e+00, -8.39999974e-01
]
],
dtype=torch.float32))
gt_labels_3d = np.array([0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
results = dict(
pts_filename='tests/data/kitti/a.bin',
ann_info=dict(gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d),
bbox3d_fields=[],
)
output = pipeline(results)
expected_tensor = torch.tensor(
[[20.6514, -8.8250, -1.0816, 1.5893, 3.0637, 1.5414, -1.9216],
[7.9374, 4.9457, -1.2008, 2.1829, 12.2357, 3.5041, 1.6629],
[20.8115, -2.0273, -1.8893, 2.2212, 14.1026, 3.4850, 2.6513],
[32.3850, -5.2135, -1.1321, 2.2212, 9.6124, 3.4562, 2.6498],
[43.7022, -7.8316, -0.5090, 2.2403, 12.2836, 3.4754, 2.0146],
[25.3300, -9.6670, -1.0855, 1.4074, 2.1350, 1.4170, -0.7141],
[16.5414, -29.0583, -0.9768, 1.4936, 3.3318, 1.3404, -0.7153],
[24.6548, -18.9226, -1.3567, 1.6659, 3.6094, 1.4170, 1.3970],
[45.8403, 1.8183, -1.1626, 1.6180, 3.9254, 1.3499, -0.6886],
[30.6288, -8.4497, -1.4881, 1.6659, 3.9158, 1.4265, -0.7241],
[32.3316, -22.4611, -1.3131, 1.5223, 4.0977, 1.3882, 2.4186],
[22.4492, 3.2944, -2.1674, 1.5510, 3.4084, 1.6372, 0.3928],
[37.3824, 5.0472, -0.6579, 1.5797, 3.3988, 1.7233, -1.4862],
[8.9259, -1.2578, -1.6081, 1.5223, 3.0350, 1.3308, -1.7212]])
assert torch.allclose(
output['gt_bboxes_3d']._data.tensor, expected_tensor, atol=1e-3)
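# With np.random.seed(0) set above, the random augmentations (ObjectNoise,
# RandomFlip3D, GlobalRotScale) are deterministic, which is why the
# transformed boxes can be compared against a fixed expected tensor.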
def test_outdoor_velocity_aug_pipeline():
point_cloud_range = [-50, -50, -5, 50, 50, 3]
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
np.random.seed(0)
train_pipeline = [
dict(type='LoadPointsFromFile', load_dim=4, use_dim=4),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScale',
rot_uniform_noise=[-0.3925, 0.3925],
scaling_uniform_noise=[0.95, 1.05],
trans_normal_noise=[0, 0, 0]),
dict(type='RandomFlip3D', flip_ratio=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
pipeline = Compose(train_pipeline)
gt_bboxes_3d = LiDARInstance3DBoxes(
torch.tensor(
[[
-5.2422e+00, 4.0021e+01, -4.7643e-01, 2.0620e+00, 4.4090e+00,
1.5480e+00, -1.4880e+00, 8.5338e-03, 4.4934e-02
],
[
-2.6675e+01, 5.5950e+00, -1.3053e+00, 3.4300e-01, 4.5800e-01,
7.8200e-01, -4.6276e+00, -4.3284e-04, -1.8543e-03
],
[
-5.8098e+00, 3.5409e+01, -6.6511e-01, 2.3960e+00, 3.9690e+00,
1.7320e+00, -4.6520e+00, 0.0000e+00, 0.0000e+00
],
[
-3.1309e+01, 1.0901e+00, -1.0561e+00, 1.9440e+00, 3.8570e+00,
1.7230e+00, -2.8143e+00, -2.7606e-02, -8.0573e-02
],
[
-4.5642e+01, 2.0136e+01, -2.4681e-02, 1.9870e+00, 4.4400e+00,
1.9420e+00, 2.8336e-01, 0.0000e+00, 0.0000e+00
],
[
-5.1617e+00, 1.8305e+01, -1.0879e+00, 2.3230e+00, 4.8510e+00,
1.3710e+00, -1.5803e+00, 0.0000e+00, 0.0000e+00
],
[
-2.5285e+01, 4.1442e+00, -1.2713e+00, 1.7550e+00, 1.9890e+00,
2.2200e+00, -4.4900e+00, -3.1784e-02, -1.5291e-01
],
[
-2.2611e+00, 1.9170e+01, -1.1452e+00, 9.1900e-01, 1.1230e+00,
1.9310e+00, 4.7790e-02, 6.7684e-02, -1.7537e+00
],
[
-6.5878e+01, 1.3500e+01, -2.2528e-01, 1.8200e+00, 3.8520e+00,
1.5450e+00, -2.8757e+00, 0.0000e+00, 0.0000e+00
],
[
-5.4490e+00, 2.8363e+01, -7.7275e-01, 2.2360e+00, 3.7540e+00,
1.5590e+00, -4.6520e+00, -7.9736e-03, 7.7207e-03
]],
dtype=torch.float32),
box_dim=9)
gt_labels_3d = np.array([0, 8, 0, 0, 0, 0, -1, 7, 0, 0])
results = dict(
pts_filename='tests/data/kitti/a.bin',
ann_info=dict(gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d),
bbox3d_fields=[],
)
output = pipeline(results)
expected_tensor = torch.tensor(
[[
-3.7849e+00, -4.1057e+01, -4.8668e-01, 2.1064e+00, 4.5039e+00,
1.5813e+00, -1.6919e+00, 1.0469e-02, -4.5533e-02
],
[
-2.7010e+01, -6.7551e+00, -1.3334e+00, 3.5038e-01, 4.6786e-01,
7.9883e-01, 1.4477e+00, -5.1440e-04, 1.8758e-03
],
[
-4.5448e+00, -3.6372e+01, -6.7942e-01, 2.4476e+00, 4.0544e+00,
1.7693e+00, 1.4721e+00, 0.0000e+00, -0.0000e+00
],
[
-3.1916e+01, -2.3379e+00, -1.0788e+00, 1.9858e+00, 3.9400e+00,
1.7601e+00, -3.6564e-01, -3.1333e-02, 8.1166e-02
],
[
-4.5802e+01, -2.2340e+01, -2.5213e-02, 2.0298e+00, 4.5355e+00,
1.9838e+00, 2.8199e+00, 0.0000e+00, -0.0000e+00
],
[
-4.5526e+00, -1.8887e+01, -1.1114e+00, 2.3730e+00, 4.9554e+00,
1.4005e+00, -1.5997e+00, 0.0000e+00, -0.0000e+00
],
[
-2.5648e+01, -5.2197e+00, -1.2987e+00, 1.7928e+00, 2.0318e+00,
2.2678e+00, 1.3100e+00, -3.8428e-02, 1.5485e-01
],
[
-1.5578e+00, -1.9657e+01, -1.1699e+00, 9.3878e-01, 1.1472e+00,
1.9726e+00, 3.0555e+00, 4.5907e-04, 1.7928e+00
],
[
-4.4522e+00, -2.9166e+01, -7.8938e-01, 2.2841e+00, 3.8348e+00,
1.5925e+00, 1.4721e+00, -7.8371e-03, -8.1931e-03
]])
assert torch.allclose(
output['gt_bboxes_3d']._data.tensor, expected_tensor, atol=1e-3)
import pytest
import torch
from mmdet3d.core.bbox import LiDARInstance3DBoxes
def test_PointwiseSemanticHead():
# PointwiseSemanticHead only supports the GPU version currently.
......@@ -47,19 +49,29 @@ def test_PointwiseSemanticHead():
[1, 35, 930, 469]],
dtype=torch.int32).cuda() # n, 4(batch, ind_x, ind_y, ind_z)
voxel_dict = dict(voxel_centers=voxel_centers, coors=coordinates)
gt_bboxes = [
LiDARInstance3DBoxes(
torch.tensor(
[[6.4118, -3.4305, -1.7291, 1.7033, 3.4693, 1.6197, -0.9091]],
dtype=torch.float32).cuda()),
LiDARInstance3DBoxes(
torch.tensor(
[[16.9107, 9.7925, -1.9201, 1.6097, 3.2786, 1.5307, -2.4056]],
dtype=torch.float32).cuda())
]
# batch size is 2 in the unit test
gt_labels = list(torch.tensor([[0], [1]], dtype=torch.int64).cuda())
# test get_targets
target_dict = self.get_targets(voxel_dict, gt_bboxes, gt_labels)
assert target_dict['seg_targets'].shape == torch.Size(
[voxel_features.shape[0]])
assert torch.allclose(target_dict['seg_targets'],
target_dict['seg_targets'].new_tensor([3, -1, 3, 3]))
assert target_dict['part_targets'].shape == torch.Size(
[voxel_features.shape[0], 3])
assert target_dict['part_targets'].sum() == 0
# test loss
loss_dict = self.loss(feats_dict, target_dict)
......@@ -67,7 +79,3 @@ def test_PointwiseSemanticHead():
assert loss_dict['loss_part'] == 0 # no points in gt_boxes
total_loss = loss_dict['loss_seg'] + loss_dict['loss_part']
total_loss.backward()