"llama/llama.cpp/src/llama-grammar.cpp" did not exist on "7a81daf02696ba1be3450878c48da78dcfcc3826"
Commit d1aac35d authored by zhangwenwei's avatar zhangwenwei
Browse files

Initial commit

parents
from .point_fusion import PointFusion
__all__ = ['PointFusion']
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init
from mmdet3d.models.utils import ConvModule
from ..plugins import NonLocal2D
from ..registry import FUSION_LAYERS
def point_sample(
img_features,
points,
lidar2img_rt,
pcd_rotate_mat,
img_scale_factor,
img_crop_offset,
pcd_trans_factor,
pcd_scale_factor,
pcd_flip,
img_flip,
img_pad_shape,
img_shape,
aligned=True,
padding_mode='zeros',
align_corners=True,
):
"""sample image features using point coordinates
Arguments:
img_features (Tensor): 1xCxHxW image features
points (Tensor): Nx3 point cloud coordinates
P (Tensor): 4x4 transformation matrix
scale_factor (Tensor): scale_factor of images
img_pad_shape (int, int): int tuple indicates the h & w after padding,
this is necessary to obtain features in feature map
img_shape (int, int): int tuple indicates the h & w before padding
after scaling, this is necessary for flipping coordinates
return:
(Tensor): NxC image features sampled by point coordinates
"""
# aug order: flip -> trans -> scale -> rot
# The transformation follows the augmentation order in data pipeline
if pcd_flip:
# if the points are flipped, flip them back first
points[:, 1] = -points[:, 1]
points -= pcd_trans_factor
# the points should be scaled to the original scale in velo coordinate
points /= pcd_scale_factor
# the points should be rotated back
    # pcd_rotate_mat @ pcd_rotate_mat.inverse() is not exactly an identity
    # matrix due to numerical precision, and we cannot construct the inverse
    # rotation from the angle either, so use the matrix inverse directly.
points = points @ pcd_rotate_mat.inverse()
# project points from velo coordinate to camera coordinate
num_points = points.shape[0]
pts_4d = torch.cat([points, points.new_ones(size=(num_points, 1))], dim=-1)
pts_2d = pts_4d @ lidar2img_rt.t()
# cam_points is Tensor of Nx4 whose last column is 1
# transform camera coordinate to image coordinate
pts_2d[:, 2] = torch.clamp(pts_2d[:, 2], min=1e-5)
pts_2d[:, 0] /= pts_2d[:, 2]
pts_2d[:, 1] /= pts_2d[:, 2]
# img transformation: scale -> crop -> flip
# the image is resized by img_scale_factor
img_coors = pts_2d[:, 0:2] * img_scale_factor # Nx2
img_coors -= img_crop_offset
# grid sample, the valid grid range should be in [-1,1]
coor_x, coor_y = torch.split(img_coors, 1, dim=1) # each is Nx1
if img_flip:
# by default we take it as horizontal flip
# use img_shape before padding for flip
orig_h, orig_w = img_shape
coor_x = orig_w - coor_x
h, w = img_pad_shape
coor_y = coor_y / h * 2 - 1
coor_x = coor_x / w * 2 - 1
grid = torch.cat([coor_x, coor_y],
dim=1).unsqueeze(0).unsqueeze(0) # Nx2 -> 1x1xNx2
    # align_corners=True provides higher performance
mode = 'bilinear' if aligned else 'nearest'
point_features = F.grid_sample(
img_features,
grid,
mode=mode,
padding_mode=padding_mode,
align_corners=align_corners) # 1xCx1xN feats
return point_features.squeeze().t()
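

# Illustrative usage sketch (not from the original file): how point_sample
# could be called with dummy tensors and identity transforms. All shapes and
# values below are assumptions made purely for demonstration.
def _example_point_sample():
    img_features = torch.rand(1, 16, 32, 32)  # 1xCxHxW feature map
    points = torch.rand(5, 3) * 10  # Nx3 points in LiDAR coordinates
    feats = point_sample(
        img_features,
        points,
        lidar2img_rt=torch.eye(4),  # identity lidar-to-image projection
        pcd_rotate_mat=torch.eye(3),  # no point cloud rotation
        img_scale_factor=torch.tensor([1.0, 1.0]),  # no image rescaling
        img_crop_offset=torch.tensor([0.0, 0.0]),  # no image cropping
        pcd_trans_factor=torch.tensor([0.0, 0.0, 0.0]),
        pcd_scale_factor=1.0,
        pcd_flip=False,
        img_flip=False,
        img_pad_shape=(32, 32),
        img_shape=(32, 32))
    return feats  # NxC features sampled at the projected point locations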
@FUSION_LAYERS.register_module
class PointFusion(nn.Module):
"""Fuse image features from fused single scale features
"""
def __init__(self,
img_channels,
pts_channels,
mid_channels,
out_channels,
img_levels=3,
conv_cfg=None,
norm_cfg=None,
activation=None,
activate_out=True,
fuse_out=False,
refine_type=None,
dropout_ratio=0,
aligned=True,
align_corners=True,
padding_mode='zeros',
lateral_conv=True):
super(PointFusion, self).__init__()
if isinstance(img_levels, int):
img_levels = [img_levels]
if isinstance(img_channels, int):
img_channels = [img_channels] * len(img_levels)
assert isinstance(img_levels, list)
assert isinstance(img_channels, list)
assert len(img_channels) == len(img_levels)
self.img_levels = img_levels
self.activation = activation
self.activate_out = activate_out
self.fuse_out = fuse_out
self.refine_type = refine_type
self.dropout_ratio = dropout_ratio
self.img_channels = img_channels
self.aligned = aligned
self.align_corners = align_corners
self.padding_mode = padding_mode
self.lateral_convs = None
if lateral_conv:
self.lateral_convs = nn.ModuleList()
for i in range(len(img_channels)):
l_conv = ConvModule(
img_channels[i],
mid_channels,
3,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
activation=self.activation,
inplace=False)
self.lateral_convs.append(l_conv)
self.img_transform = nn.Sequential(
nn.Linear(mid_channels * len(img_channels), out_channels),
nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01),
)
else:
self.img_transform = nn.Sequential(
nn.Linear(sum(img_channels), out_channels),
nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01),
)
self.pts_transform = nn.Sequential(
nn.Linear(pts_channels, out_channels),
nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01),
)
if self.fuse_out:
self.fuse_conv = nn.Sequential(
nn.Linear(mid_channels, out_channels),
# For pts the BN is initialized differently by default
# TODO: check whether this is necessary
nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01),
nn.ReLU(inplace=False))
if self.refine_type == 'non_local':
self.refine = NonLocal2D(
out_channels,
reduction=1,
use_scale=False,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg)
self.init_weights()
# default init_weights for conv(msra) and norm in ConvModule
def init_weights(self):
for m in self.modules():
if isinstance(m, (nn.Conv2d, nn.Linear)):
xavier_init(m, distribution='uniform')
def forward(self, img_feats, pts, pts_feats, img_meta):
"""
img_feats (List[Tensor]): img features
pts: [List[Tensor]]: a batch of points with shape Nx3
pts_feats (Tensor): a tensor consist of point features of the
total batch
"""
img_pts = self.obtain_mlvl_feats(img_feats, pts, img_meta)
img_pre_fuse = self.img_transform(img_pts)
if self.training and self.dropout_ratio > 0:
img_pre_fuse = F.dropout(img_pre_fuse, self.dropout_ratio)
pts_pre_fuse = self.pts_transform(pts_feats)
fuse_out = img_pre_fuse + pts_pre_fuse
if self.activate_out:
fuse_out = F.relu(fuse_out)
if self.fuse_out:
fuse_out = self.fuse_conv(fuse_out)
if self.refine_type is not None:
fuse_out_T = fuse_out.t()[None, ..., None] # NxC -> 1xCxNx1
batch_idx = 0
attentive = []
for i in range(len(pts)):
end_idx = batch_idx + len(pts[i])
attentive.append(
self.refine(fuse_out_T[:, :, batch_idx:end_idx]))
batch_idx = end_idx
fuse_out = torch.cat(attentive, dim=-2).squeeze().t()
return fuse_out
def obtain_mlvl_feats(self, img_feats, pts, img_meta):
if self.lateral_convs is not None:
img_ins = [
lateral_conv(img_feats[i])
for i, lateral_conv in zip(self.img_levels, self.lateral_convs)
]
else:
img_ins = img_feats
img_feats_per_point = []
# Sample multi-level features
for i in range(len(img_meta)):
mlvl_img_feats = []
for level in range(len(self.img_levels)):
if torch.isnan(img_ins[level][i:i + 1]).any():
import pdb
pdb.set_trace()
mlvl_img_feats.append(
self.sample_single(img_ins[level][i:i + 1], pts[i][:, :3],
img_meta[i]))
mlvl_img_feats = torch.cat(mlvl_img_feats, dim=-1)
img_feats_per_point.append(mlvl_img_feats)
img_pts = torch.cat(img_feats_per_point, dim=0)
return img_pts
def sample_single(self, img_feats, pts, img_meta):
pcd_scale_factor = (
img_meta['pcd_scale_factor']
if 'pcd_scale_factor' in img_meta.keys() else 1)
pcd_trans_factor = (
pts.new_tensor(img_meta['pcd_trans'])
if 'pcd_trans' in img_meta.keys() else 0)
pcd_rotate_mat = (
pts.new_tensor(img_meta['pcd_rotation'])
if 'pcd_rotation' in img_meta.keys() else
torch.eye(3).type_as(pts).to(pts.device))
img_scale_factor = (
img_meta['scale_factor']
if 'scale_factor' in img_meta.keys() else 1)
pcd_flip = img_meta['pcd_flip'] if 'pcd_flip' in img_meta.keys(
) else False
img_flip = img_meta['flip'] if 'flip' in img_meta.keys() else False
img_crop_offset = (
pts.new_tensor(img_meta['img_crop_offset'])
if 'img_crop_offset' in img_meta.keys() else 0)
img_pts = point_sample(
img_feats,
pts,
pts.new_tensor(img_meta['lidar2img']),
pcd_rotate_mat,
img_scale_factor,
img_crop_offset,
pcd_trans_factor,
pcd_scale_factor,
pcd_flip=pcd_flip,
img_flip=img_flip,
img_pad_shape=img_meta['pad_shape'][:2],
img_shape=img_meta['img_shape'][:2],
aligned=self.aligned,
padding_mode=self.padding_mode,
align_corners=self.align_corners,
)
return img_pts
from mmdet.models.losses import FocalLoss, SmoothL1Loss
__all__ = ['FocalLoss', 'SmoothL1Loss']
from .pillar_scatter import PointPillarsScatter
from .sparse_encoder import SparseEncoder
__all__ = ['PointPillarsScatter', 'SparseEncoder']
import torch
from torch import nn
from ..registry import MIDDLE_ENCODERS
@MIDDLE_ENCODERS.register_module
class PointPillarsScatter(nn.Module):
def __init__(self, in_channels, output_shape):
"""
Point Pillar's Scatter.
Converts learned features from dense tensor to sparse pseudo image.
Args:
output_shape (list[int]): Required output shape of features.
in_channels (int): Number of input features.
"""
super().__init__()
self.name = 'PointPillarsScatter'
self.output_shape = output_shape
self.ny = output_shape[0]
self.nx = output_shape[1]
self.nchannels = in_channels
def forward(self, voxel_features, coors, batch_size=None):
# TODO: rewrite the function in a batch manner
# no need to deal with different batch cases
if batch_size is not None:
return self.forward_batch(voxel_features, coors, batch_size)
else:
return self.forward_single(voxel_features, coors)
def forward_single(self, voxel_features, coors):
# Create the canvas for this sample
canvas = torch.zeros(
self.nchannels,
self.nx * self.ny,
dtype=voxel_features.dtype,
device=voxel_features.device)
indices = coors[:, 1] * self.nx + coors[:, 2]
indices = indices.long()
voxels = voxel_features.t()
# Now scatter the blob back to the canvas.
canvas[:, indices] = voxels
# Undo the column stacking to final 4-dim tensor
canvas = canvas.view(1, self.nchannels, self.ny, self.nx)
return [canvas]
def forward_batch(self, voxel_features, coors, batch_size):
# batch_canvas will be the final output.
batch_canvas = []
for batch_itt in range(batch_size):
# Create the canvas for this sample
canvas = torch.zeros(
self.nchannels,
self.nx * self.ny,
dtype=voxel_features.dtype,
device=voxel_features.device)
# Only include non-empty pillars
batch_mask = coors[:, 0] == batch_itt
this_coors = coors[batch_mask, :]
indices = this_coors[:, 2] * self.nx + this_coors[:, 3]
indices = indices.type(torch.long)
voxels = voxel_features[batch_mask, :]
voxels = voxels.t()
# Now scatter the blob back to the canvas.
canvas[:, indices] = voxels
# Append to a list for later stacking.
batch_canvas.append(canvas)
# Stack to 3-dim tensor (batch-size, nchannels, nrows*ncols)
batch_canvas = torch.stack(batch_canvas, 0)
# Undo the column stacking to final 4-dim tensor
batch_canvas = batch_canvas.view(batch_size, self.nchannels, self.ny,
self.nx)
return batch_canvas
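

# Illustrative sketch (assumed toy values, not part of the original module):
# scatter three pillar feature vectors onto a tiny 4x4 BEV canvas.
def _example_pillar_scatter():
    scatter = PointPillarsScatter(in_channels=8, output_shape=[4, 4])
    voxel_features = torch.rand(3, 8)  # features of 3 non-empty pillars
    coors = torch.tensor([[0, 0, 1, 2],  # (batch_idx, z, y, x) per pillar
                          [0, 0, 3, 0],
                          [0, 0, 0, 3]])
    canvas = scatter(voxel_features, coors, batch_size=1)
    return canvas  # (1, 8, 4, 4) pseudo image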
import torch.nn as nn
import mmdet3d.ops.spconv as spconv
from ..registry import MIDDLE_ENCODERS
from ..utils import build_norm_layer
@MIDDLE_ENCODERS.register_module
class SparseEncoder(nn.Module):
def __init__(self,
in_channels,
output_shape,
pre_act,
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01)):
super().__init__()
self.sparse_shape = output_shape
self.output_shape = output_shape
self.in_channels = in_channels
self.pre_act = pre_act
        # spconv initializes its weights on its own
        # TODO: make the network structure configurable
if pre_act:
self.conv_input = spconv.SparseSequential(
spconv.SubMConv3d(
in_channels,
16,
3,
padding=1,
bias=False,
indice_key='subm1'), )
block = self.pre_act_block
else:
norm_name, norm_layer = build_norm_layer(norm_cfg, 16)
self.conv_input = spconv.SparseSequential(
spconv.SubMConv3d(
in_channels,
16,
3,
padding=1,
bias=False,
indice_key='subm1'),
norm_layer,
nn.ReLU(),
)
block = self.post_act_block
self.conv1 = spconv.SparseSequential(
block(16, 16, 3, norm_cfg=norm_cfg, padding=1,
indice_key='subm1'), )
self.conv2 = spconv.SparseSequential(
# [1600, 1408, 41] -> [800, 704, 21]
block(
16,
32,
3,
norm_cfg=norm_cfg,
stride=2,
padding=1,
indice_key='spconv2',
conv_type='spconv'),
block(32, 32, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm2'),
block(32, 32, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm2'),
)
self.conv3 = spconv.SparseSequential(
# [800, 704, 21] -> [400, 352, 11]
block(
32,
64,
3,
norm_cfg=norm_cfg,
stride=2,
padding=1,
indice_key='spconv3',
conv_type='spconv'),
block(64, 64, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm3'),
block(64, 64, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm3'),
)
self.conv4 = spconv.SparseSequential(
# [400, 352, 11] -> [200, 176, 5]
block(
64,
64,
3,
norm_cfg=norm_cfg,
stride=2,
padding=(0, 1, 1),
indice_key='spconv4',
conv_type='spconv'),
block(64, 64, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm4'),
block(64, 64, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm4'),
)
norm_name, norm_layer = build_norm_layer(norm_cfg, 128)
self.conv_out = spconv.SparseSequential(
# [200, 176, 5] -> [200, 176, 2]
spconv.SparseConv3d(
128,
128, (3, 1, 1),
stride=(2, 1, 1),
padding=0,
bias=False,
indice_key='spconv_down2'),
norm_layer,
nn.ReLU(),
)
def forward(self, voxel_features, coors, batch_size):
"""
:param voxel_features: (N, C)
:param coors: (N, 4) [batch_idx, z_idx, y_idx, x_idx]
:param batch_size:
:return:
"""
coors = coors.int()
input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors,
self.sparse_shape,
batch_size)
x = self.conv_input(input_sp_tensor)
x_conv1 = self.conv1(x)
x_conv2 = self.conv2(x_conv1)
x_conv3 = self.conv3(x_conv2)
x_conv4 = self.conv4(x_conv3)
# for detection head
# [200, 176, 5] -> [200, 176, 2]
out = self.conv_out(x_conv4)
spatial_features = out.dense()
N, C, D, H, W = spatial_features.shape
spatial_features = spatial_features.view(N, C * D, H, W)
return spatial_features
def pre_act_block(self,
in_channels,
out_channels,
kernel_size,
indice_key=None,
stride=1,
padding=0,
conv_type='subm',
norm_cfg=None):
norm_name, norm_layer = build_norm_layer(norm_cfg, in_channels)
if conv_type == 'subm':
m = spconv.SparseSequential(
norm_layer,
nn.ReLU(inplace=True),
spconv.SubMConv3d(
in_channels,
out_channels,
kernel_size,
padding=padding,
bias=False,
indice_key=indice_key),
)
elif conv_type == 'spconv':
m = spconv.SparseSequential(
norm_layer,
nn.ReLU(inplace=True),
spconv.SparseConv3d(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
bias=False,
indice_key=indice_key),
)
else:
raise NotImplementedError
return m
def post_act_block(self,
in_channels,
out_channels,
kernel_size,
indice_key,
stride=1,
padding=0,
conv_type='subm',
norm_cfg=None):
norm_name, norm_layer = build_norm_layer(norm_cfg, out_channels)
if conv_type == 'subm':
m = spconv.SparseSequential(
spconv.SubMConv3d(
in_channels,
out_channels,
kernel_size,
bias=False,
indice_key=indice_key),
norm_layer,
nn.ReLU(inplace=True),
)
elif conv_type == 'spconv':
m = spconv.SparseSequential(
spconv.SparseConv3d(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
bias=False,
indice_key=indice_key),
norm_layer,
nn.ReLU(inplace=True),
)
else:
raise NotImplementedError
return m
from mmdet.models.necks.fpn import FPN
from .second_fpn import SECONDFPN
__all__ = ['FPN', 'SECONDFPN']
import logging
from functools import partial
import torch
import torch.nn as nn
from mmcv.cnn import constant_init, kaiming_init
from mmcv.runner import load_checkpoint
from torch.nn import Sequential
from torch.nn.modules.batchnorm import _BatchNorm
from .. import builder
from ..registry import NECKS
from ..utils import build_norm_layer
class Empty(nn.Module):
def __init__(self, *args, **kwargs):
super(Empty, self).__init__()
def forward(self, *args, **kwargs):
if len(args) == 1:
return args[0]
elif len(args) == 0:
return None
return args
@NECKS.register_module
class SECONDFPN(nn.Module):
"""Compare with RPN, RPNV2 support arbitrary number of stage.
"""
def __init__(self,
use_norm=True,
in_channels=[128, 128, 256],
upsample_strides=[1, 2, 4],
num_upsample_filters=[256, 256, 256],
norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01)):
        # for GroupNorm, use
        # norm_cfg=dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True)
super(SECONDFPN, self).__init__()
assert len(num_upsample_filters) == len(upsample_strides)
self.in_channels = in_channels
if norm_cfg is not None:
ConvTranspose2d = partial(nn.ConvTranspose2d, bias=False)
else:
ConvTranspose2d = partial(nn.ConvTranspose2d, bias=True)
deblocks = []
for i, num_upsample_filter in enumerate(num_upsample_filters):
norm_layer = (
build_norm_layer(norm_cfg, num_upsample_filter)[1]
if norm_cfg is not None else Empty)
deblock = Sequential(
ConvTranspose2d(
in_channels[i],
num_upsample_filter,
upsample_strides[i],
stride=upsample_strides[i]),
norm_layer,
nn.ReLU(inplace=True),
)
deblocks.append(deblock)
self.deblocks = nn.ModuleList(deblocks)
def init_weights(self, pretrained=None):
if isinstance(pretrained, str):
logger = logging.getLogger()
load_checkpoint(self, pretrained, strict=False, logger=logger)
elif pretrained is None:
            # keeping this initialization yields better results
for m in self.modules():
if isinstance(m, nn.Conv2d):
kaiming_init(m)
elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
constant_init(m, 1)
else:
raise TypeError('pretrained must be a str or None')
return
def forward(self, inputs):
assert len(inputs) == len(self.in_channels)
ups = [deblock(inputs[i]) for i, deblock in enumerate(self.deblocks)]
if len(ups) > 1:
x = torch.cat(ups, dim=1)
else:
x = ups[0]
return [x]
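

# Illustrative sketch (assumed shapes): SECONDFPN upsamples each BEV feature
# map to a common resolution and concatenates them along the channel axis.
# The channel/stride values below are illustrative, not a recommended config.
def _example_second_fpn():
    neck = SECONDFPN(
        in_channels=[64, 128, 256],
        upsample_strides=[1, 2, 4],
        num_upsample_filters=[128, 128, 128])
    inputs = [
        torch.rand(2, 64, 100, 100),
        torch.rand(2, 128, 50, 50),
        torch.rand(2, 256, 25, 25),
    ]
    outs = neck(inputs)
    return outs[0]  # (2, 384, 100, 100): three 128-channel maps concatenated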
@NECKS.register_module
class SECONDFusionFPN(SECONDFPN):
"""Compare with RPN, RPNV2 support arbitrary number of stage.
"""
def __init__(self,
use_norm=True,
in_channels=[128, 128, 256],
upsample_strides=[1, 2, 4],
num_upsample_filters=[256, 256, 256],
norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
down_sample_rate=[40, 8, 8],
fusion_layer=None,
cat_points=False):
super(SECONDFusionFPN, self).__init__(
use_norm,
in_channels,
upsample_strides,
num_upsample_filters,
norm_cfg,
)
self.fusion_layer = None
if fusion_layer is not None:
self.fusion_layer = builder.build_fusion_layer(fusion_layer)
self.cat_points = cat_points
self.down_sample_rate = down_sample_rate
def forward(self,
inputs,
coors=None,
points=None,
img_feats=None,
img_meta=None):
assert len(inputs) == len(self.in_channels)
ups = [deblock(inputs[i]) for i, deblock in enumerate(self.deblocks)]
if len(ups) > 1:
x = torch.cat(ups, dim=1)
else:
x = ups[0]
if (self.fusion_layer is not None and img_feats is not None):
downsample_pts_coors = torch.zeros_like(coors)
downsample_pts_coors[:, 0] = coors[:, 0]
downsample_pts_coors[:, 1] = (
coors[:, 1] / self.down_sample_rate[0])
downsample_pts_coors[:, 2] = (
coors[:, 2] / self.down_sample_rate[1])
downsample_pts_coors[:, 3] = (
coors[:, 3] / self.down_sample_rate[2])
# fusion for each point
x = self.fusion_layer(img_feats, points, x, downsample_pts_coors,
img_meta)
return [x]
from mmdet.utils import Registry
VOXEL_ENCODERS = Registry('voxel_encoder')
MIDDLE_ENCODERS = Registry('middle_encoder')
FUSION_LAYERS = Registry('fusion_layer')
from mmdet.models.roi_extractors.single_level import SingleRoIExtractor
__all__ = ['SingleRoIExtractor']
from mmdet.models.utils import ResLayer, bias_init_with_prob
__all__ = ['bias_init_with_prob', 'ResLayer']
import numpy as np
import torch.nn as nn
def xavier_init(module, gain=1, bias=0, distribution='normal'):
assert distribution in ['uniform', 'normal']
if distribution == 'uniform':
nn.init.xavier_uniform_(module.weight, gain=gain)
else:
nn.init.xavier_normal_(module.weight, gain=gain)
    if hasattr(module, 'bias') and module.bias is not None:
        nn.init.constant_(module.bias, bias)
def normal_init(module, mean=0, std=1, bias=0):
nn.init.normal_(module.weight, mean, std)
    if hasattr(module, 'bias') and module.bias is not None:
        nn.init.constant_(module.bias, bias)
def uniform_init(module, a=0, b=1, bias=0):
nn.init.uniform_(module.weight, a, b)
    if hasattr(module, 'bias') and module.bias is not None:
        nn.init.constant_(module.bias, bias)
def kaiming_init(module,
mode='fan_out',
nonlinearity='relu',
bias=0,
distribution='normal'):
assert distribution in ['uniform', 'normal']
if distribution == 'uniform':
nn.init.kaiming_uniform_(
module.weight, mode=mode, nonlinearity=nonlinearity)
else:
nn.init.kaiming_normal_(
module.weight, mode=mode, nonlinearity=nonlinearity)
    if hasattr(module, 'bias') and module.bias is not None:
        nn.init.constant_(module.bias, bias)
def bias_init_with_prob(prior_prob):
""" initialize conv/fc bias value according to giving probablity"""
bias_init = float(-np.log((1 - prior_prob) / prior_prob))
return bias_init
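

# Illustrative sketch: with the focal-loss style prior of 0.01 the bias is
# about -4.595, so a sigmoid over the initial logits outputs ~0.01.
def _example_bias_init():
    bias = bias_init_with_prob(0.01)  # ~ -4.595
    prob = 1 / (1 + np.exp(-bias))  # ~ 0.01, recovers the prior probability
    return bias, prob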
from .pillar_encoder import AlignedPillarFeatureNet, PillarFeatureNet
from .voxel_encoder import (DynamicVFE, VoxelFeatureExtractor,
VoxelFeatureExtractorV2, VoxelFeatureExtractorV3)
__all__ = [
'PillarFeatureNet', 'AlignedPillarFeatureNet', 'VoxelFeatureExtractor',
'DynamicVFE', 'VoxelFeatureExtractorV2', 'VoxelFeatureExtractorV3'
]
import torch
from torch import nn
from mmdet3d.ops import DynamicScatter, build_norm_layer
from ..registry import VOXEL_ENCODERS
from .utils import PFNLayer, get_paddings_indicator
@VOXEL_ENCODERS.register_module
class PillarFeatureNet(nn.Module):
def __init__(self,
num_input_features=4,
use_norm=True,
num_filters=(64, ),
with_distance=False,
with_cluster_center=True,
with_voxel_center=True,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1),
mode='max'):
""" Pillar Feature Net.
The network prepares the pillar features and performs forward pass
through PFNLayers.
Args:
num_input_features (int). Number of input features,
either x, y, z or x, y, z, r.
use_norm (bool). Whether to include BatchNorm.
num_filters (list[int]). Number of features in each of the
N PFNLayers.
with_distance (bool). Whether to include Euclidean distance
to points.
voxel_size (list[float]). Size of voxels, only utilize x and y
size.
point_cloud_range (list[float>]). Point cloud range, only
utilize x and y min.
"""
super(PillarFeatureNet, self).__init__()
assert len(num_filters) > 0
if with_cluster_center:
num_input_features += 3
if with_voxel_center:
num_input_features += 2
if with_distance:
num_input_features += 1
self._with_distance = with_distance
self._with_cluster_center = with_cluster_center
self._with_voxel_center = with_voxel_center
# Create PillarFeatureNet layers
self.num_input_features = num_input_features
num_filters = [num_input_features] + list(num_filters)
pfn_layers = []
for i in range(len(num_filters) - 1):
in_filters = num_filters[i]
out_filters = num_filters[i + 1]
if i < len(num_filters) - 2:
last_layer = False
else:
last_layer = True
pfn_layers.append(
PFNLayer(
in_filters,
out_filters,
use_norm,
last_layer=last_layer,
mode=mode))
self.pfn_layers = nn.ModuleList(pfn_layers)
# Need pillar (voxel) size and x/y offset in order to calculate offset
self.vx = voxel_size[0]
self.vy = voxel_size[1]
self.x_offset = self.vx / 2 + point_cloud_range[0]
self.y_offset = self.vy / 2 + point_cloud_range[1]
self.point_cloud_range = point_cloud_range
def forward(self, features, num_points, coors):
features_ls = [features]
# Find distance of x, y, and z from cluster center
if self._with_cluster_center:
points_mean = features[:, :, :3].sum(
dim=1, keepdim=True) / num_points.type_as(features).view(
-1, 1, 1)
f_cluster = features[:, :, :3] - points_mean
features_ls.append(f_cluster)
# Find distance of x, y, and z from pillar center
if self._with_voxel_center:
            # take a copy so the raw x/y columns of `features` are not
            # overwritten in place by the center offsets below
            f_center = features[:, :, :2].clone()
f_center[:, :, 0] = f_center[:, :, 0] - (
coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
self.x_offset)
f_center[:, :, 1] = f_center[:, :, 1] - (
coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
self.y_offset)
features_ls.append(f_center)
if self._with_distance:
points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
features_ls.append(points_dist)
# Combine together feature decorations
features = torch.cat(features_ls, dim=-1)
# The feature decorations were calculated without regard to whether
# pillar was empty. Need to ensure that
# empty pillars remain set to zeros.
voxel_count = features.shape[1]
mask = get_paddings_indicator(num_points, voxel_count, axis=0)
mask = torch.unsqueeze(mask, -1).type_as(features)
features *= mask
for pfn in self.pfn_layers:
features = pfn(features, num_points)
return features.squeeze()
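

# Illustrative sketch (assumed shapes): decorate raw pillar points with
# cluster- and voxel-center offsets, then max-pool to one 64-d vector per
# pillar. Values are toy data, not a real voxelization result.
def _example_pillar_feature_net():
    pfn = PillarFeatureNet(num_input_features=4)
    features = torch.rand(6, 30, 4)  # 6 pillars, up to 30 points, (x, y, z, r)
    num_points = torch.randint(1, 31, (6, ))  # valid points per pillar
    coors = torch.zeros(6, 4, dtype=torch.long)  # (batch_idx, z, y, x)
    out = pfn(features, num_points, coors)
    return out  # (6, 64) pillar features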
@VOXEL_ENCODERS.register_module
class DynamicPillarFeatureNet(PillarFeatureNet):
def __init__(self,
num_input_features=4,
use_norm=True,
num_filters=(64, ),
with_distance=False,
with_cluster_center=True,
with_voxel_center=True,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
mode='max'):
"""
Dynamic Pillar Feature Net for Dynamic Voxelization.
The difference is in the forward part
"""
super(DynamicPillarFeatureNet, self).__init__(
num_input_features,
use_norm,
num_filters,
with_distance,
with_cluster_center=with_cluster_center,
with_voxel_center=with_voxel_center,
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
mode=mode)
num_filters = [self.num_input_features] + list(num_filters)
pfn_layers = []
# TODO: currently only support one PFNLayer
for i in range(len(num_filters) - 1):
in_filters = num_filters[i]
out_filters = num_filters[i + 1]
if i > 0:
in_filters *= 2
norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
pfn_layers.append(
nn.Sequential(
nn.Linear(in_filters, out_filters, bias=False), norm_layer,
nn.ReLU(inplace=True)))
self.num_pfn = len(pfn_layers)
self.pfn_layers = nn.ModuleList(pfn_layers)
self.pfn_scatter = DynamicScatter(voxel_size, point_cloud_range,
(mode != 'max'))
self.cluster_scatter = DynamicScatter(
voxel_size, point_cloud_range, average_points=True)
def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors):
# Step 1: scatter voxel into canvas
# Calculate necessary things for canvas creation
canvas_y = int(
(self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy)
canvas_x = int(
(self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx)
canvas_channel = voxel_mean.size(1)
batch_size = pts_coors[-1, 0] + 1
canvas_len = canvas_y * canvas_x * batch_size
# Create the canvas for this sample
canvas = voxel_mean.new_zeros(canvas_channel, canvas_len)
# Only include non-empty pillars
indices = (
voxel_coors[:, 0] * canvas_y * canvas_x +
voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3])
# Scatter the blob back to the canvas
canvas[:, indices.long()] = voxel_mean.t()
# Step 2: get voxel mean for each point
voxel_index = (
pts_coors[:, 0] * canvas_y * canvas_x +
pts_coors[:, 2] * canvas_x + pts_coors[:, 3])
center_per_point = canvas[:, voxel_index.long()].t()
return center_per_point
def forward(self, features, coors):
"""
features (torch.Tensor): NxC
coors (torch.Tensor): Nx(1+NDim)
"""
features_ls = [features]
# Find distance of x, y, and z from cluster center
if self._with_cluster_center:
voxel_mean, mean_coors = self.cluster_scatter(features, coors)
points_mean = self.map_voxel_center_to_point(
coors, voxel_mean, mean_coors)
# TODO: maybe also do cluster for reflectivity
f_cluster = features[:, :3] - points_mean[:, :3]
features_ls.append(f_cluster)
# Find distance of x, y, and z from pillar center
if self._with_voxel_center:
f_center = features.new_zeros(size=(features.size(0), 2))
f_center[:, 0] = features[:, 0] - (
coors[:, 3].type_as(features) * self.vx + self.x_offset)
f_center[:, 1] = features[:, 1] - (
coors[:, 2].type_as(features) * self.vy + self.y_offset)
features_ls.append(f_center)
if self._with_distance:
points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True)
features_ls.append(points_dist)
# Combine together feature decorations
features = torch.cat(features_ls, dim=-1)
for i, pfn in enumerate(self.pfn_layers):
point_feats = pfn(features)
voxel_feats, voxel_coors = self.pfn_scatter(point_feats, coors)
if i != len(self.pfn_layers) - 1:
# need to concat voxel feats if it is not the last pfn
feat_per_point = self.map_voxel_center_to_point(
coors, voxel_feats, voxel_coors)
features = torch.cat([point_feats, feat_per_point], dim=1)
return voxel_feats, voxel_coors
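

# Illustrative sketch of the flattened-canvas trick used in
# map_voxel_center_to_point above, written with plain tensors so it does not
# need DynamicScatter. The grid size and coordinates are toy assumptions.
def _example_voxel_center_to_point():
    canvas_x, canvas_y = 4, 3  # tiny 3x4 BEV grid, single sample
    voxel_mean = torch.tensor([[1.0], [2.0]])  # 2 voxels, 1 feature channel
    voxel_coors = torch.tensor([[0, 0, 1, 2],  # (batch_idx, z, y, x)
                                [0, 0, 2, 3]])
    pts_coors = torch.tensor([[0, 0, 1, 2],  # two points fall in voxel 0
                              [0, 0, 1, 2],
                              [0, 0, 2, 3]])  # one point falls in voxel 1
    canvas = voxel_mean.new_zeros(1, canvas_y * canvas_x)
    indices = voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3]
    canvas[:, indices.long()] = voxel_mean.t()  # scatter voxel stats to canvas
    voxel_index = pts_coors[:, 2] * canvas_x + pts_coors[:, 3]
    return canvas[:, voxel_index.long()].t()  # (3, 1): [[1.], [1.], [2.]]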
@VOXEL_ENCODERS.register_module
class AlignedPillarFeatureNet(nn.Module):
def __init__(self,
num_input_features=4,
use_norm=True,
num_filters=(64, ),
with_distance=False,
with_cluster_center=True,
with_voxel_center=True,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1),
mode='max'):
""" Pillar Feature Net.
The network prepares the pillar features and performs forward pass
through PFNLayers.
Args:
num_input_features (int): Number of input features, either x, y, z
or x, y, z, r.
use_norm (bool): Whether to include BatchNorm.
num_filters (list[int]): Number of features in each of the N
PFNLayers.
with_distance (bool): Whether to include Euclidean distance to
points.
voxel_size (list[float]): Size of voxels, only utilize x and y
size.
            point_cloud_range (list[float]): Point cloud range, only
utilize x and y min.
"""
super(AlignedPillarFeatureNet, self).__init__()
assert len(num_filters) > 0
if with_cluster_center:
print('Use cluster center')
num_input_features += 3
if with_voxel_center:
print('Use voxel center')
num_input_features += 2
if with_distance:
num_input_features += 1
self._with_distance = with_distance
self._with_cluster_center = with_cluster_center
self._with_voxel_center = with_voxel_center
# Create PillarFeatureNet layers
num_filters = [num_input_features] + list(num_filters)
pfn_layers = []
for i in range(len(num_filters) - 1):
in_filters = num_filters[i]
out_filters = num_filters[i + 1]
if i < len(num_filters) - 2:
last_layer = False
else:
last_layer = True
pfn_layers.append(
PFNLayer(
in_filters,
out_filters,
use_norm,
last_layer=last_layer,
mode=mode))
self.pfn_layers = nn.ModuleList(pfn_layers)
# Need pillar (voxel) size and x/y offset in order to
# calculate pillar offset
self.vx = voxel_size[0]
self.vy = voxel_size[1]
self.vz = voxel_size[2]
self.x_offset = self.vx / 2 + point_cloud_range[0]
self.y_offset = self.vy / 2 + point_cloud_range[1]
self.z_offset = self.vz / 2 + point_cloud_range[2]
def forward(self, features, num_points, coors):
features_ls = [features]
# Find distance of x, y, and z from cluster center
if self._with_cluster_center:
points_mean = features[:, :, :3].sum(
dim=1, keepdim=True) / num_points.type_as(features).view(
-1, 1, 1)
f_cluster = features[:, :, :3] - points_mean
features_ls.append(f_cluster)
x_distance = features[:, :, 0] - (
coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
self.x_offset)
y_distance = features[:, :, 1] - (
coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
self.y_offset)
z_distance = features[:, :, 2] - (
coors[:, 1].type_as(features).unsqueeze(1) * self.vz +
self.z_offset)
normed_x_distance = 1 - torch.abs(x_distance / self.vx)
normed_y_distance = 1 - torch.abs(y_distance / self.vy)
normed_z_distance = 1 - torch.abs(z_distance / self.vz)
x_mask = torch.gt(normed_x_distance, 0).type_as(features)
y_mask = torch.gt(normed_y_distance, 0).type_as(features)
z_mask = torch.gt(normed_z_distance, 0).type_as(features)
nonzero_points_mask = x_mask.mul(y_mask).mul(z_mask)
aligned_distance = normed_x_distance.mul(normed_y_distance).mul(
normed_z_distance).mul(nonzero_points_mask)
# Find distance of x, y, and z from pillar center
if self._with_voxel_center:
            # take a copy so the raw x/y columns of `features` are not
            # overwritten in place by the center offsets below
            f_center = features[:, :, :2].clone()
f_center[:, :, 0] = f_center[:, :, 0] - (
coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
self.x_offset)
f_center[:, :, 1] = f_center[:, :, 1] - (
coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
self.y_offset)
features_ls.append(f_center)
if self._with_distance:
points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
features_ls.append(points_dist)
# Combine together feature decorations
features = torch.cat(features_ls, dim=-1)
# The feature decorations were calculated without regard to
# whether pillar was empty. Need to ensure that
# empty pillars remain set to zeros.
voxel_count = features.shape[1]
mask = get_paddings_indicator(num_points, voxel_count, axis=0)
mask = torch.unsqueeze(mask, -1).type_as(features)
features *= mask
for pfn in self.pfn_layers:
if pfn.last_vfe:
features = pfn(features, aligned_distance)
else:
features = pfn(features)
return features.squeeze()
import torch
from torch import nn
from torch.nn import functional as F
from ..utils import build_norm_layer
class Empty(nn.Module):
def __init__(self, *args, **kwargs):
super(Empty, self).__init__()
def forward(self, *args, **kwargs):
if len(args) == 1:
return args[0]
elif len(args) == 0:
return None
return args
def get_paddings_indicator(actual_num, max_num, axis=0):
"""Create boolean mask by actually number of a padded tensor.
Args:
actual_num ([type]): [description]
max_num ([type]): [description]
Returns:
[type]: [description]
"""
actual_num = torch.unsqueeze(actual_num, axis + 1)
# tiled_actual_num: [N, M, 1]
max_num_shape = [1] * len(actual_num.shape)
max_num_shape[axis + 1] = -1
max_num = torch.arange(
max_num, dtype=torch.int, device=actual_num.device).view(max_num_shape)
# tiled_actual_num: [[3,3,3,3,3], [4,4,4,4,4], [2,2,2,2,2]]
# tiled_max_num: [[0,1,2,3,4], [0,1,2,3,4], [0,1,2,3,4]]
paddings_indicator = actual_num.int() > max_num
# paddings_indicator shape: [batch_size, max_num]
return paddings_indicator
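

# Illustrative sketch: for three pillars holding 3, 4 and 2 valid points and
# padded to a maximum of 5, the indicator marks the non-padded slots.
def _example_paddings_indicator():
    actual_num = torch.tensor([3, 4, 2])
    mask = get_paddings_indicator(actual_num, max_num=5, axis=0)
    # mask:
    # [[True, True, True, False, False],
    #  [True, True, True, True, False],
    #  [True, True, False, False, False]]
    return mask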
class VFELayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
max_out=True,
cat_max=True):
super(VFELayer, self).__init__()
self.cat_max = cat_max
self.max_out = max_out
# self.units = int(out_channels / 2)
if norm_cfg:
norm_name, norm_layer = build_norm_layer(norm_cfg, out_channels)
self.norm = norm_layer
self.linear = nn.Linear(in_channels, out_channels, bias=False)
else:
self.norm = Empty(out_channels)
self.linear = nn.Linear(in_channels, out_channels, bias=True)
def forward(self, inputs):
# [K, T, 7] tensordot [7, units] = [K, T, units]
voxel_count = inputs.shape[1]
x = self.linear(inputs)
x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2,
1).contiguous()
pointwise = F.relu(x)
# [K, T, units]
if self.max_out:
aggregated = torch.max(pointwise, dim=1, keepdim=True)[0]
else:
# this is for fusion layer
return pointwise
if not self.cat_max:
return aggregated.squeeze(1)
else:
# [K, 1, units]
repeated = aggregated.repeat(1, voxel_count, 1)
concatenated = torch.cat([pointwise, repeated], dim=2)
# [K, T, 2 * units]
return concatenated
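

# Illustrative sketch (assumed shapes): one VFELayer lifts per-point features
# and concatenates the per-voxel max back onto every point, as in VoxelNet.
def _example_vfe_layer():
    vfe = VFELayer(in_channels=7, out_channels=32)
    inputs = torch.rand(4, 35, 7)  # 4 voxels, up to 35 points each
    out = vfe(inputs)
    return out  # (4, 35, 64): point-wise features concatenated with the max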
class PFNLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
use_norm=True,
last_layer=False,
mode='max'):
""" Pillar Feature Net Layer.
The Pillar Feature Net is composed of a series of these layers, but the
PointPillars paper results only used a single PFNLayer.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
use_norm (bool): Whether to include BatchNorm.
last_layer (bool): If last_layer, there is no concatenation of
features.
"""
super().__init__()
self.name = 'PFNLayer'
self.last_vfe = last_layer
if not self.last_vfe:
out_channels = out_channels // 2
self.units = out_channels
if use_norm:
self.norm = nn.BatchNorm1d(self.units, eps=1e-3, momentum=0.01)
self.linear = nn.Linear(in_channels, self.units, bias=False)
else:
            self.norm = Empty(self.units)
self.linear = nn.Linear(in_channels, self.units, bias=True)
self.mode = mode
def forward(self, inputs, num_voxels=None, aligned_distance=None):
x = self.linear(inputs)
x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2,
1).contiguous()
x = F.relu(x)
if self.mode == 'max':
if aligned_distance is not None:
x = x.mul(aligned_distance.unsqueeze(-1))
x_max = torch.max(x, dim=1, keepdim=True)[0]
elif self.mode == 'avg':
if aligned_distance is not None:
x = x.mul(aligned_distance.unsqueeze(-1))
x_max = x.sum(
dim=1, keepdim=True) / num_voxels.type_as(inputs).view(
-1, 1, 1)
if self.last_vfe:
return x_max
else:
x_repeat = x_max.repeat(1, inputs.shape[1], 1)
x_concatenated = torch.cat([x, x_repeat], dim=2)
return x_concatenated
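

# Illustrative sketch (assumed shapes): a final PFNLayer max-pools the
# decorated per-point features into a single feature vector per pillar.
def _example_pfn_layer():
    pfn = PFNLayer(in_channels=9, out_channels=64, last_layer=True)
    features = torch.rand(7, 20, 9)  # 7 pillars, up to 20 points each
    out = pfn(features)
    return out  # (7, 1, 64)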
import torch
from torch import nn
from torch.nn import functional as F
from mmdet3d.ops import DynamicScatter
from .. import builder
from ..registry import VOXEL_ENCODERS
from ..utils import build_norm_layer
from .utils import Empty, VFELayer, get_paddings_indicator
@VOXEL_ENCODERS.register_module
class VoxelFeatureExtractor(nn.Module):
def __init__(self,
num_input_features=4,
use_norm=True,
num_filters=[32, 128],
with_distance=False,
name='VoxelFeatureExtractor'):
super(VoxelFeatureExtractor, self).__init__()
self.name = name
assert len(num_filters) == 2
num_input_features += 3 # add mean features
if with_distance:
num_input_features += 1
self._with_distance = with_distance
self.vfe1 = VFELayer(num_input_features, num_filters[0], use_norm)
self.vfe2 = VFELayer(num_filters[0], num_filters[1], use_norm)
if use_norm:
self.linear = nn.Linear(num_filters[1], num_filters[1], bias=False)
self.norm = nn.BatchNorm1d(num_filters[1], eps=1e-3, momentum=0.01)
else:
self.linear = nn.Linear(num_filters[1], num_filters[1], bias=True)
self.norm = Empty(num_filters[1])
def forward(self, features, num_voxels, **kwargs):
# features: [concated_num_points, num_voxel_size, 3(4)]
# num_voxels: [concated_num_points]
# t = time.time()
# torch.cuda.synchronize()
points_mean = features[:, :, :3].sum(
dim=1, keepdim=True) / num_voxels.type_as(features).view(-1, 1, 1)
features_relative = features[:, :, :3] - points_mean
if self._with_distance:
points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
features = torch.cat([features, features_relative, points_dist],
dim=-1)
else:
features = torch.cat([features, features_relative], dim=-1)
voxel_count = features.shape[1]
mask = get_paddings_indicator(num_voxels, voxel_count, axis=0)
mask = torch.unsqueeze(mask, -1).type_as(features)
# mask = features.max(dim=2, keepdim=True)[0] != 0
# torch.cuda.synchronize()
# print("vfe prep forward time", time.time() - t)
x = self.vfe1(features)
x *= mask
x = self.vfe2(x)
x *= mask
x = self.linear(x)
x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2,
1).contiguous()
x = F.relu(x)
x *= mask
# x: [concated_num_points, num_voxel_size, 128]
voxelwise = torch.max(x, dim=1)[0]
return voxelwise
@VOXEL_ENCODERS.register_module
class VoxelFeatureExtractorV2(nn.Module):
def __init__(self,
num_input_features=4,
use_norm=True,
num_filters=[32, 128],
with_distance=False,
name='VoxelFeatureExtractor'):
super(VoxelFeatureExtractorV2, self).__init__()
self.name = name
assert len(num_filters) > 0
num_input_features += 3
if with_distance:
num_input_features += 1
self._with_distance = with_distance
num_filters = [num_input_features] + num_filters
filters_pairs = [[num_filters[i], num_filters[i + 1]]
for i in range(len(num_filters) - 1)]
self.vfe_layers = nn.ModuleList(
[VFELayer(i, o, use_norm) for i, o in filters_pairs])
if use_norm:
self.linear = nn.Linear(
num_filters[-1], num_filters[-1], bias=False)
self.norm = nn.BatchNorm1d(
num_filters[-1], eps=1e-3, momentum=0.01)
else:
self.linear = nn.Linear(
num_filters[-1], num_filters[-1], bias=True)
self.norm = Empty(num_filters[-1])
def forward(self, features, num_voxels, **kwargs):
# features: [concated_num_points, num_voxel_size, 3(4)]
# num_voxels: [concated_num_points]
points_mean = features[:, :, :3].sum(
dim=1, keepdim=True) / num_voxels.type_as(features).view(-1, 1, 1)
features_relative = features[:, :, :3] - points_mean
if self._with_distance:
points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
features = torch.cat([features, features_relative, points_dist],
dim=-1)
else:
features = torch.cat([features, features_relative], dim=-1)
voxel_count = features.shape[1]
mask = get_paddings_indicator(num_voxels, voxel_count, axis=0)
mask = torch.unsqueeze(mask, -1).type_as(features)
for vfe in self.vfe_layers:
features = vfe(features)
features *= mask
features = self.linear(features)
features = self.norm(features.permute(0, 2, 1).contiguous()).permute(
0, 2, 1).contiguous()
features = F.relu(features)
features *= mask
# x: [concated_num_points, num_voxel_size, 128]
voxelwise = torch.max(features, dim=1)[0]
return voxelwise
@VOXEL_ENCODERS.register_module
class VoxelFeatureExtractorV3(nn.Module):
def __init__(self,
num_input_features=4,
use_norm=True,
num_filters=[32, 128],
with_distance=False,
name='VoxelFeatureExtractor'):
super(VoxelFeatureExtractorV3, self).__init__()
self.name = name
def forward(self, features, num_points, coors):
# features: [concated_num_points, num_voxel_size, 3(4)]
# num_points: [concated_num_points]
points_mean = features[:, :, :4].sum(
dim=1, keepdim=False) / num_points.type_as(features).view(-1, 1)
return points_mean.contiguous()
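

# Illustrative sketch (toy data): VoxelFeatureExtractorV3 simply averages the
# (x, y, z, r) values of the points in each voxel; in real data the padded
# slots are zeros, which is assumed but not enforced here.
def _example_vfe_v3():
    vfe = VoxelFeatureExtractorV3()
    features = torch.rand(10, 5, 4)  # 10 voxels, up to 5 points, (x, y, z, r)
    num_points = torch.randint(1, 6, (10, ))  # valid points per voxel
    return vfe(features, num_points, coors=None)  # (10, 4) mean features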
@VOXEL_ENCODERS.register_module
class DynamicVFEV3(nn.Module):
def __init__(self,
num_input_features=4,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1)):
super(DynamicVFEV3, self).__init__()
self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)
@torch.no_grad()
def forward(self, features, coors):
# This function is used from the start of the voxelnet
# num_points: [concated_num_points]
features, features_coors = self.scatter(features, coors)
return features, features_coors
@VOXEL_ENCODERS.register_module
class DynamicVFE(nn.Module):
def __init__(self,
num_input_features=4,
num_filters=[],
with_distance=False,
with_cluster_center=False,
with_voxel_center=False,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
mode='max',
fusion_layer=None,
return_point_feats=False):
super(DynamicVFE, self).__init__()
assert len(num_filters) > 0
if with_cluster_center:
num_input_features += 3
if with_voxel_center:
num_input_features += 3
if with_distance:
            num_input_features += 1
self.num_input_features = num_input_features
self._with_distance = with_distance
self._with_cluster_center = with_cluster_center
self._with_voxel_center = with_voxel_center
self.return_point_feats = return_point_feats
# Need pillar (voxel) size and x/y offset in order to calculate offset
self.vx = voxel_size[0]
self.vy = voxel_size[1]
self.vz = voxel_size[2]
self.x_offset = self.vx / 2 + point_cloud_range[0]
self.y_offset = self.vy / 2 + point_cloud_range[1]
self.z_offset = self.vz / 2 + point_cloud_range[2]
self.point_cloud_range = point_cloud_range
self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)
num_filters = [self.num_input_features] + list(num_filters)
vfe_layers = []
for i in range(len(num_filters) - 1):
in_filters = num_filters[i]
out_filters = num_filters[i + 1]
if i > 0:
in_filters *= 2
norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
vfe_layers.append(
nn.Sequential(
nn.Linear(in_filters, out_filters, bias=False), norm_layer,
nn.ReLU(inplace=True)))
self.vfe_layers = nn.ModuleList(vfe_layers)
self.num_vfe = len(vfe_layers)
self.vfe_scatter = DynamicScatter(voxel_size, point_cloud_range,
(mode != 'max'))
self.cluster_scatter = DynamicScatter(
voxel_size, point_cloud_range, average_points=True)
self.fusion_layer = None
if fusion_layer is not None:
self.fusion_layer = builder.build_fusion_layer(fusion_layer)
def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors):
# Step 1: scatter voxel into canvas
# Calculate necessary things for canvas creation
canvas_z = int(
(self.point_cloud_range[5] - self.point_cloud_range[2]) / self.vz)
canvas_y = int(
(self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy)
canvas_x = int(
(self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx)
# canvas_channel = voxel_mean.size(1)
batch_size = pts_coors[-1, 0] + 1
canvas_len = canvas_z * canvas_y * canvas_x * batch_size
# Create the canvas for this sample
canvas = voxel_mean.new_zeros(canvas_len, dtype=torch.long)
# Only include non-empty pillars
indices = (
voxel_coors[:, 0] * canvas_z * canvas_y * canvas_x +
voxel_coors[:, 1] * canvas_y * canvas_x +
voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3])
# Scatter the blob back to the canvas
canvas[indices.long()] = torch.arange(
start=0, end=voxel_mean.size(0), device=voxel_mean.device)
# Step 2: get voxel mean for each point
voxel_index = (
pts_coors[:, 0] * canvas_z * canvas_y * canvas_x +
pts_coors[:, 1] * canvas_y * canvas_x +
pts_coors[:, 2] * canvas_x + pts_coors[:, 3])
voxel_inds = canvas[voxel_index.long()]
center_per_point = voxel_mean[voxel_inds, ...]
return center_per_point
def forward(self,
features,
coors,
points=None,
img_feats=None,
img_meta=None):
"""
features (torch.Tensor): NxC
coors (torch.Tensor): Nx(1+NDim)
"""
features_ls = [features]
# Find distance of x, y, and z from cluster center
if self._with_cluster_center:
voxel_mean, mean_coors = self.cluster_scatter(features, coors)
points_mean = self.map_voxel_center_to_point(
coors, voxel_mean, mean_coors)
# TODO: maybe also do cluster for reflectivity
f_cluster = features[:, :3] - points_mean[:, :3]
features_ls.append(f_cluster)
# Find distance of x, y, and z from pillar center
if self._with_voxel_center:
f_center = features.new_zeros(size=(features.size(0), 3))
f_center[:, 0] = features[:, 0] - (
coors[:, 3].type_as(features) * self.vx + self.x_offset)
f_center[:, 1] = features[:, 1] - (
coors[:, 2].type_as(features) * self.vy + self.y_offset)
f_center[:, 2] = features[:, 2] - (
coors[:, 1].type_as(features) * self.vz + self.z_offset)
features_ls.append(f_center)
if self._with_distance:
points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True)
features_ls.append(points_dist)
# Combine together feature decorations
features = torch.cat(features_ls, dim=-1)
for i, vfe in enumerate(self.vfe_layers):
point_feats = vfe(features)
if (i == len(self.vfe_layers) - 1 and self.fusion_layer is not None
and img_feats is not None):
point_feats = self.fusion_layer(img_feats, points, point_feats,
img_meta)
voxel_feats, voxel_coors = self.vfe_scatter(point_feats, coors)
if i != len(self.vfe_layers) - 1:
# need to concat voxel feats if it is not the last vfe
feat_per_point = self.map_voxel_center_to_point(
coors, voxel_feats, voxel_coors)
features = torch.cat([point_feats, feat_per_point], dim=1)
if self.return_point_feats:
return point_feats
return voxel_feats, voxel_coors
@VOXEL_ENCODERS.register_module
class HardVFE(nn.Module):
def __init__(self,
num_input_features=4,
num_filters=[],
with_distance=False,
with_cluster_center=False,
with_voxel_center=False,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
mode='max',
fusion_layer=None,
return_point_feats=False):
super(HardVFE, self).__init__()
assert len(num_filters) > 0
if with_cluster_center:
num_input_features += 3
if with_voxel_center:
num_input_features += 3
if with_distance:
            num_input_features += 1
self.num_input_features = num_input_features
self._with_distance = with_distance
self._with_cluster_center = with_cluster_center
self._with_voxel_center = with_voxel_center
self.return_point_feats = return_point_feats
# Need pillar (voxel) size and x/y offset to calculate pillar offset
self.vx = voxel_size[0]
self.vy = voxel_size[1]
self.vz = voxel_size[2]
self.x_offset = self.vx / 2 + point_cloud_range[0]
self.y_offset = self.vy / 2 + point_cloud_range[1]
self.z_offset = self.vz / 2 + point_cloud_range[2]
self.point_cloud_range = point_cloud_range
self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)
num_filters = [self.num_input_features] + list(num_filters)
vfe_layers = []
for i in range(len(num_filters) - 1):
in_filters = num_filters[i]
out_filters = num_filters[i + 1]
if i > 0:
in_filters *= 2
# TODO: pass norm_cfg to VFE
# norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
if i == (len(num_filters) - 2):
cat_max = False
max_out = True
if fusion_layer:
max_out = False
else:
max_out = True
cat_max = True
vfe_layers.append(
VFELayer(
in_filters,
out_filters,
norm_cfg=norm_cfg,
max_out=max_out,
cat_max=cat_max))
self.vfe_layers = nn.ModuleList(vfe_layers)
self.num_vfe = len(vfe_layers)
self.fusion_layer = None
if fusion_layer is not None:
self.fusion_layer = builder.build_fusion_layer(fusion_layer)
def forward(self,
features,
num_points,
coors,
img_feats=None,
img_meta=None):
"""
features (torch.Tensor): NxMxC
coors (torch.Tensor): Nx(1+NDim)
"""
features_ls = [features]
# Find distance of x, y, and z from cluster center
if self._with_cluster_center:
points_mean = (
features[:, :, :3].sum(dim=1, keepdim=True) /
num_points.type_as(features).view(-1, 1, 1))
# TODO: maybe also do cluster for reflectivity
f_cluster = features[:, :, :3] - points_mean
features_ls.append(f_cluster)
# Find distance of x, y, and z from pillar center
if self._with_voxel_center:
f_center = features.new_zeros(
size=(features.size(0), features.size(1), 3))
f_center[:, :, 0] = features[:, :, 0] - (
coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
self.x_offset)
f_center[:, :, 1] = features[:, :, 1] - (
coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
self.y_offset)
f_center[:, :, 2] = features[:, :, 2] - (
coors[:, 1].type_as(features).unsqueeze(1) * self.vz +
self.z_offset)
features_ls.append(f_center)
if self._with_distance:
points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
features_ls.append(points_dist)
# Combine together feature decorations
voxel_feats = torch.cat(features_ls, dim=-1)
# The feature decorations were calculated without regard to whether
# pillar was empty.
# Need to ensure that empty voxels remain set to zeros.
voxel_count = voxel_feats.shape[1]
mask = get_paddings_indicator(num_points, voxel_count, axis=0)
voxel_feats *= mask.unsqueeze(-1).type_as(voxel_feats)
for i, vfe in enumerate(self.vfe_layers):
voxel_feats = vfe(voxel_feats)
if torch.isnan(voxel_feats).any():
import pdb
pdb.set_trace()
if (self.fusion_layer is not None and img_feats is not None):
voxel_feats = self.fusion_with_mask(features, mask, voxel_feats,
coors, img_feats, img_meta)
if torch.isnan(voxel_feats).any():
import pdb
pdb.set_trace()
return voxel_feats
def fusion_with_mask(self, features, mask, voxel_feats, coors, img_feats,
img_meta):
        # the features tensor consists of points from the whole batch
batch_size = coors[-1, 0] + 1
points = []
for i in range(batch_size):
single_mask = (coors[:, 0] == i)
points.append(features[single_mask][mask[single_mask]])
point_feats = voxel_feats[mask]
if torch.isnan(point_feats).any():
import pdb
pdb.set_trace()
point_feats = self.fusion_layer(img_feats, points, point_feats,
img_meta)
if torch.isnan(point_feats).any():
import pdb
pdb.set_trace()
voxel_canvas = voxel_feats.new_zeros(
size=(voxel_feats.size(0), voxel_feats.size(1),
point_feats.size(-1)))
voxel_canvas[mask] = point_feats
out = torch.max(voxel_canvas, dim=1)[0]
if torch.isnan(out).any():
import pdb
pdb.set_trace()
return out
from mmdet.ops import (RoIAlign, SigmoidFocalLoss, build_norm_layer,
get_compiler_version, get_compiling_cuda_version, nms,
roi_align, sigmoid_focal_loss)
from .voxel import DynamicScatter, Voxelization, dynamic_scatter, voxelization
__all__ = [
    'nms', 'RoIAlign', 'roi_align', 'get_compiler_version',
    'get_compiling_cuda_version', 'build_norm_layer', 'Voxelization',
    'voxelization', 'dynamic_scatter', 'DynamicScatter', 'sigmoid_focal_loss',
    'SigmoidFocalLoss'
]
from .iou3d_utils import (boxes_iou3d_gpu, boxes_iou_bev, nms_gpu,
nms_normal_gpu)
__all__ = ['boxes_iou_bev', 'boxes_iou3d_gpu', 'nms_gpu', 'nms_normal_gpu']
import torch
from . import iou3d_cuda
def boxes_iou_bev(boxes_a, boxes_b):
"""
:param boxes_a: (M, 5)
:param boxes_b: (N, 5)
:return:
ans_iou: (M, N)
"""
ans_iou = torch.cuda.FloatTensor(
torch.Size((boxes_a.shape[0], boxes_b.shape[0]))).zero_()
iou3d_cuda.boxes_iou_bev_gpu(boxes_a.contiguous(), boxes_b.contiguous(),
ans_iou)
return ans_iou
def boxes_iou3d_gpu(boxes_a, boxes_b, mode='iou'):
"""
:param boxes_a: (N, 7) [x, y, z, h, w, l, ry]
:param boxes_b: (M, 7) [x, y, z, h, w, l, ry]
:param mode "iou" (intersection over union) or iof (intersection over
foreground).
:return:
ans_iou: (M, N)
"""
boxes_a_bev = boxes3d_to_bev_torch(boxes_a)
boxes_b_bev = boxes3d_to_bev_torch(boxes_b)
# bev overlap
overlaps_bev = torch.cuda.FloatTensor(
torch.Size((boxes_a.shape[0], boxes_b.shape[0]))).zero_() # (N, M)
iou3d_cuda.boxes_overlap_bev_gpu(boxes_a_bev.contiguous(),
boxes_b_bev.contiguous(), overlaps_bev)
# height overlap
boxes_a_height_min = (boxes_a[:, 1] - boxes_a[:, 3]).view(-1, 1)
boxes_a_height_max = boxes_a[:, 1].view(-1, 1)
boxes_b_height_min = (boxes_b[:, 1] - boxes_b[:, 3]).view(1, -1)
boxes_b_height_max = boxes_b[:, 1].view(1, -1)
max_of_min = torch.max(boxes_a_height_min, boxes_b_height_min)
min_of_max = torch.min(boxes_a_height_max, boxes_b_height_max)
overlaps_h = torch.clamp(min_of_max - max_of_min, min=0)
# 3d iou
overlaps_3d = overlaps_bev * overlaps_h
vol_a = (boxes_a[:, 3] * boxes_a[:, 4] * boxes_a[:, 5]).view(-1, 1)
vol_b = (boxes_b[:, 3] * boxes_b[:, 4] * boxes_b[:, 5]).view(1, -1)
if mode == 'iou':
        # the clamp func is used to avoid division by 0
iou3d = overlaps_3d / torch.clamp(
vol_a + vol_b - overlaps_3d, min=1e-8)
else:
iou3d = overlaps_3d / torch.clamp(vol_a, min=1e-8)
return iou3d
def nms_gpu(boxes, scores, thresh):
"""
:param boxes: (N, 5) [x1, y1, x2, y2, ry]
:param scores: (N)
:param thresh:
:return:
"""
# areas = (x2 - x1) * (y2 - y1)
order = scores.sort(0, descending=True)[1]
boxes = boxes[order].contiguous()
keep = torch.LongTensor(boxes.size(0))
num_out = iou3d_cuda.nms_gpu(boxes, keep, thresh)
return order[keep[:num_out].cuda()].contiguous()
def nms_normal_gpu(boxes, scores, thresh):
"""
:param boxes: (N, 5) [x1, y1, x2, y2, ry]
:param scores: (N)
:param thresh:
:return:
"""
# areas = (x2 - x1) * (y2 - y1)
order = scores.sort(0, descending=True)[1]
boxes = boxes[order].contiguous()
keep = torch.LongTensor(boxes.size(0))
num_out = iou3d_cuda.nms_normal_gpu(boxes, keep, thresh)
return order[keep[:num_out].cuda()].contiguous()
def boxes3d_to_bev_torch(boxes3d):
"""
:param boxes3d: (N, 7) [x, y, z, h, w, l, ry] in camera coords
:return:
boxes_bev: (N, 5) [x1, y1, x2, y2, ry]
"""
boxes_bev = boxes3d.new(torch.Size((boxes3d.shape[0], 5)))
cu, cv = boxes3d[:, 0], boxes3d[:, 2]
half_l, half_w = boxes3d[:, 5] / 2, boxes3d[:, 4] / 2
boxes_bev[:, 0], boxes_bev[:, 1] = cu - half_l, cv - half_w
boxes_bev[:, 2], boxes_bev[:, 3] = cu + half_l, cv + half_w
boxes_bev[:, 4] = boxes3d[:, 6]
return boxes_bev
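

# Illustrative sketch: a camera-frame box centered at x=1, z=2 with w=2, l=4
# and yaw 0.5 maps to the BEV rectangle [x1, y1, x2, y2, ry].
def _example_boxes3d_to_bev():
    boxes3d = torch.tensor([[1.0, 0.0, 2.0, 1.5, 2.0, 4.0, 0.5]])
    return boxes3d_to_bev_torch(boxes3d)  # tensor([[-1., 1., 3., 3., 0.5]])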
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='iou3d',
ext_modules=[
CUDAExtension(
'iou3d_cuda', [
'src/iou3d.cpp',
'src/iou3d_kernel.cu',
],
extra_compile_args={
'cxx': ['-g', '-I /usr/local/cuda/include'],
'nvcc': ['-O2']
})
],
cmdclass={'build_ext': BuildExtension})