Commit f717eb62 authored by wuyuefeng, committed by zhangwenwei

VoteNet

parent ac3590a1
# model settings
model = dict(
type='VoteNet',
backbone=dict(
type='PointNet2SASSG',
in_channels=4,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
(128, 128, 256)),
fp_channels=((256, 256), (256, 256)),
norm_cfg=dict(type='BN2d'),
pool_mod='max'),
bbox_head=dict(
type='VoteHead',
num_classes=18,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=18,
num_dir_bins=1,
with_rot=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]),
vote_module_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True),
feat_channels=(128, 128),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
center_loss=dict(
type='ChamferDistance',
mode='l2',
reduction='sum',
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)))
# model training and testing settings
train_cfg = dict(pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')
test_cfg = dict(
sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True)
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin')
train_pipeline = [
dict(
type='LoadPointsFromFile',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=True,
with_seg_3d=True),
dict(
type='PointSegClassMapping',
valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
36, 39)),
dict(type='IndoorPointSample', num_points=40000),
dict(type='IndoorFlipData', flip_ratio_yz=0.5, flip_ratio_xz=0.5),
dict(
type='IndoorGlobalRotScale',
shift_height=True,
rot_range=[-1 / 36, 1 / 36],
scale_range=None),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask'
])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='IndoorPointSample', num_points=40000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points'])
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=5,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_train.pkl',
pipeline=train_pipeline,
filter_empty_gt=False,
classes=class_names)),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True))
# optimizer
lr = 0.008 # max learning rate
optimizer = dict(type='Adam', lr=lr)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[24, 32])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=10,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 36
dist_params = dict(backend='nccl')
log_level = 'INFO'
find_unused_parameters = True
work_dir = './work_dirs/votenet_scannet'
load_from = None
resume_from = None
workflow = [('train', 1)]
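
A quick sanity check of the input-channel bookkeeping in this config (a sketch, not part of the commit): LoadPointsFromFile keeps only x, y, z via use_dim=[0, 1, 2] and, with shift_height=True, appends one height feature per point, which is what makes in_channels=4 in the backbone consistent with the pipeline.

# Sketch: why backbone.in_channels == 4 for this pipeline.
use_dim = [0, 1, 2]      # x, y, z kept from the 6 loaded dims
shift_height = True      # assumed to append one height feature per point
in_channels = len(use_dim) + int(shift_height)
assert in_channels == 4  # matches backbone.in_channels above
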
# model settings
model = dict(
type='VoteNet',
backbone=dict(
type='PointNet2SASSG',
in_channels=4,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
(128, 128, 256)),
fp_channels=((256, 256), (256, 256)),
norm_cfg=dict(type='BN2d'),
pool_mod='max'),
bbox_head=dict(
type='VoteHead',
num_classes=10,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=10,
num_dir_bins=12,
with_rot=True,
mean_sizes=[[2.114256, 1.620300, 0.927272],
[0.791118, 1.279516, 0.718182],
[0.923508, 1.867419, 0.845495],
[0.591958, 0.552978, 0.827272],
[0.699104, 0.454178, 0.75625],
[0.69519, 1.346299, 0.736364],
[0.528526, 1.002642, 1.172878],
[0.500618, 0.632163, 0.683424],
[0.404671, 1.071108, 1.688889],
[0.76584, 1.398258, 0.472728]]),
vote_module_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True),
feat_channels=(128, 128),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
center_loss=dict(
type='ChamferDistance',
mode='l2',
reduction='sum',
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)))
# model training and testing settings
train_cfg = dict(pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')
test_cfg = dict(
sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True)
# dataset settings
dataset_type = 'SUNRGBDDataset'
data_root = 'data/sunrgbd/'
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub')
train_pipeline = [
dict(
type='LoadPointsFromFile',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='LoadAnnotations3D'),
dict(type='IndoorFlipData', flip_ratio_yz=0.5),
dict(
type='IndoorGlobalRotScale',
shift_height=True,
rot_range=[-1 / 6, 1 / 6],
scale_range=[0.85, 1.15]),
dict(type='IndoorPointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='IndoorPointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points'])
]
data = dict(
samples_per_gpu=16,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=5,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
filter_empty_gt=False)),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True))
# optimizer
lr = 0.008 # max learning rate
optimizer = dict(type='Adam', lr=lr)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[24, 32])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=30,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 36
dist_params = dict(backend='nccl')
log_level = 'INFO'
find_unused_parameters = True
work_dir = './work_dirs/votenet_sunrgbd'
load_from = None
resume_from = None
workflow = [('train', 1)]
@@ -8,7 +8,9 @@ from .samplers import (BaseSampler, CombinedSampler,
InstanceBalancedPosSampler, IoUBalancedNegSampler,
PseudoSampler, RandomSampler, SamplingResult)
from .structures import Box3DMode, CameraInstance3DBoxes, LiDARInstance3DBoxes
-from .transforms import bbox3d2result, bbox3d2roi, boxes3d_to_bev_torch_lidar
+from .transforms import (bbox3d2result, bbox3d2roi,
+box3d_to_corner3d_upright_depth,
+boxes3d_to_bev_torch_lidar)
from .assign_sampling import ( # isort:skip, avoid recursive imports
build_bbox_coder, # temporary setting
@@ -22,5 +24,6 @@ __all__ = [
'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'boxes3d_to_bev_torch_lidar',
'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d',
'bbox_overlaps_3d', 'Box3DMode', 'LiDARInstance3DBoxes',
-'CameraInstance3DBoxes', 'bbox3d2roi', 'bbox3d2result'
+'CameraInstance3DBoxes', 'bbox3d2roi', 'bbox3d2result',
+'box3d_to_corner3d_upright_depth'
]
from mmdet.core.bbox import build_bbox_coder
from .delta_xyzwhlr_bbox_coder import DeltaXYZWLHRBBoxCoder
+from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder
-__all__ = ['build_bbox_coder', 'DeltaXYZWLHRBBoxCoder']
+__all__ = [
+'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'PartialBinBasedBBoxCoder'
+]
import numpy as np
import torch
from mmdet.core.bbox import BaseBBoxCoder
from mmdet.core.bbox.builder import BBOX_CODERS
@BBOX_CODERS.register_module()
class PartialBinBasedBBoxCoder(BaseBBoxCoder):
"""Partial bin based bbox coder
Args:
num_dir_bins (int): Number of bins to encode direction angle.
num_sizes (int): Number of size clusters.
mean_sizes (list[list[float]]): Mean size of bboxes in each class.
with_rot (bool): Whether the bbox is with rotation.
"""
def __init__(self, num_dir_bins, num_sizes, mean_sizes, with_rot=True):
super(PartialBinBasedBBoxCoder, self).__init__()
assert len(mean_sizes) == num_sizes
self.num_dir_bins = num_dir_bins
self.num_sizes = num_sizes
self.mean_sizes = mean_sizes
self.with_rot = with_rot
def encode(self, gt_bboxes_3d, gt_labels_3d):
"""Encode ground truth to prediction targets.
Args:
gt_bboxes_3d (Tensor): 3d gt bboxes with shape (n, 7).
gt_labels_3d (Tensor): Gt classes.
Returns:
tuple: Targets of center, size and direction.
"""
# generate center target
center_target = gt_bboxes_3d[..., 0:3]
# generate bbox size target
size_class_target = gt_labels_3d
size_res_target = gt_bboxes_3d[..., 3:6] - gt_bboxes_3d.new_tensor(
self.mean_sizes)[size_class_target]
# generate dir target
box_num = gt_bboxes_3d.shape[0]
if self.with_rot:
(dir_class_target,
dir_res_target) = self.angle2class(gt_bboxes_3d[..., 6])
else:
dir_class_target = gt_labels_3d.new_zeros(box_num)
dir_res_target = gt_bboxes_3d.new_zeros(box_num)
return (center_target, size_class_target, size_res_target,
dir_class_target, dir_res_target)
def decode(self, bbox_out):
"""Decode predicted parts to bbox3d.
Args:
bbox_out (dict): Predictions from the model; should contain the keys below.
- center: predicted bottom center of bboxes.
- dir_class: predicted bbox direction class.
- dir_res: predicted bbox direction residual.
- size_class: predicted bbox size class.
- size_res: predicted bbox size residual.
Returns:
Tensor: decoded bbox3d with shape (batch, n, 7)
"""
center = bbox_out['center']
batch_size, num_proposal = center.shape[:2]
# decode heading angle
if self.with_rot:
dir_class = torch.argmax(bbox_out['dir_class'], -1)
dir_res = torch.gather(bbox_out['dir_res'], 2,
dir_class.unsqueeze(-1))
dir_res.squeeze_(2)
dir_angle = self.class2angle(dir_class, dir_res).reshape(
batch_size, num_proposal, 1)
else:
dir_angle = center.new_zeros(batch_size, num_proposal, 1)
# decode bbox size
size_class = torch.argmax(bbox_out['size_class'], -1, keepdim=True)
size_res = torch.gather(bbox_out['size_res'], 2,
size_class.unsqueeze(-1).repeat(1, 1, 1, 3))
mean_sizes = center.new_tensor(self.mean_sizes)
size_base = torch.index_select(mean_sizes, 0, size_class.reshape(-1))
bbox_size = size_base.reshape(batch_size, num_proposal,
-1) + size_res.squeeze(2)
bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1)
return bbox3d
def split_pred(self, preds, base_xyz):
"""Split predicted features to specific parts.
Args:
preds (Tensor): predicted features to split.
base_xyz (Tensor): coordinates of points.
Returns:
dict: split results.
"""
results = {}
start, end = 0, 0
preds_trans = preds.transpose(2, 1)
# decode objectness score
end += 2
results['obj_scores'] = preds_trans[..., start:end]
start = end
# decode center
end += 3
# (batch_size, num_proposal, 3)
results['center'] = base_xyz + preds_trans[..., start:end]
start = end
# decode direction
end += self.num_dir_bins
results['dir_class'] = preds_trans[..., start:end]
start = end
end += self.num_dir_bins
dir_res_norm = preds_trans[..., start:end]
start = end
results['dir_res_norm'] = dir_res_norm
results['dir_res'] = dir_res_norm * (np.pi / self.num_dir_bins)
# decode size
end += self.num_sizes
results['size_class'] = preds_trans[..., start:end]
start = end
end += self.num_sizes * 3
size_res_norm = preds_trans[..., start:end]
batch_size, num_proposal = preds_trans.shape[:2]
size_res_norm = size_res_norm.view(
[batch_size, num_proposal, self.num_sizes, 3])
start = end
results['size_res_norm'] = size_res_norm
mean_sizes = preds.new_tensor(self.mean_sizes)
results['size_res'] = (
size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0))
# decode semantic score
results['sem_scores'] = preds_trans[..., start:]
return results
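# Note (added for clarity, a sketch of the layout assumed by split_pred
# above): the channel axis of `preds` is consumed in this order:
# 2 (objectness) + 3 (center offset) + num_dir_bins (direction class)
# + num_dir_bins (direction residual) + num_sizes (size class)
# + num_sizes * 3 (size residual), with the remaining channels taken as
# semantic scores.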
def angle2class(self, angle):
"""Convert continuous angle to a discrete class and a residual.
Convert continuous angle to a discrete class and a small
regression number from class center angle to current angle.
Args:
angle (Tensor): Angle is from 0-2pi (or -pi~pi), class center at
0, 1*(2pi/N), 2*(2pi/N) ... (N-1)*(2pi/N)
Returns:
tuple: Encoded discrete class and residual.
"""
angle = angle % (2 * np.pi)
angle_per_class = 2 * np.pi / float(self.num_dir_bins)
shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi)
angle_cls = shifted_angle // angle_per_class
angle_res = shifted_angle - (
angle_cls * angle_per_class + angle_per_class / 2)
return angle_cls.long(), angle_res
def class2angle(self, angle_cls, angle_res, limit_period=True):
"""Inverse function to angle2class
Args:
angle_cls (Tensor): Angle class to decode.
angle_res (Tensor): Angle residual to decode.
limit_period (bool): Whether to limit angle to [-pi, pi].
Returns:
Tensor: angle decoded from angle_cls and angle_res.
"""
angle_per_class = 2 * np.pi / float(self.num_dir_bins)
angle_center = angle_cls.float() * angle_per_class
angle = angle_center + angle_res
if limit_period:
angle[angle > np.pi] -= 2 * np.pi
return angle
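
The angle encoding above is easy to sanity-check in isolation. A minimal sketch (assumes the coder is importable from mmdet3d.core.bbox.coders as the __init__ above suggests, and uses the 12 direction bins of the SUN RGB-D config):

import torch
from mmdet3d.core.bbox.coders import PartialBinBasedBBoxCoder

coder = PartialBinBasedBBoxCoder(
    num_dir_bins=12, num_sizes=10, mean_sizes=[[1., 1., 1.]] * 10)
angle = torch.tensor([0.3, 3.0])
angle_cls, angle_res = coder.angle2class(angle)    # bins of width 2*pi/12
decoded = coder.class2angle(angle_cls, angle_res)  # back to radians
assert torch.allclose(decoded, angle)  # round trip holds within [-pi, pi]
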
@@ -84,3 +84,87 @@ def bbox3d2result(bboxes, scores, labels):
"""
return dict(
boxes_3d=bboxes.cpu(), scores_3d=scores.cpu(), labels_3d=labels.cpu())
def upright_depth_to_lidar_torch(points=None,
bboxes=None,
to_bottom_center=False):
"""Convert points and boxes in upright depth coordinate to lidar.
Args:
points (None | Tensor): points in upright depth coordinate.
bboxes (None | Tensor): bboxes in upright depth coordinate.
to_bottom_center (bool): Whether to convert bboxes to bottom center.
Returns:
tuple: points and bboxes in lidar coordinate.
"""
if points is not None:
points_lidar = points.clone()
points_lidar = points_lidar[..., [1, 0, 2]]
points_lidar[..., 1] *= -1
else:
points_lidar = None
if bboxes is not None:
bboxes_lidar = bboxes.clone()
bboxes_lidar = bboxes_lidar[..., [1, 0, 2, 4, 3, 5, 6]]
bboxes_lidar[..., 1] *= -1
if to_bottom_center:
bboxes_lidar[..., 2] -= 0.5 * bboxes_lidar[..., 5]
else:
bboxes_lidar = None
return points_lidar, bboxes_lidar
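# Worked example (a sketch, not part of the commit): the mapping swaps the
# x/y axes, negates the new y, and swaps the l/w extents to match:
#   pts = torch.tensor([[1., 2., 3.]])
#   box = torch.tensor([[1., 2., 3., 4., 5., 6., 0.1]])
#   upright_depth_to_lidar_torch(pts, box, to_bottom_center=True)
#   # -> (tensor([[2., -1., 3.]]),
#   #     tensor([[2., -1., 0., 5., 4., 6., 0.1]]))  # z = 3 - 6 / 2 = 0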
def box3d_to_corner3d_upright_depth(boxes3d):
"""Convert box3d to corner3d in upright depth coordinate
Args:
boxes3d (Tensor): boxes with shape [n,7] in upright depth coordinate
Returns:
Tensor: boxes with [n, 8, 3] in upright depth coordinate
"""
boxes_num = boxes3d.shape[0]
ry = boxes3d[:, 6:7]
l, w, h = boxes3d[:, 3:4], boxes3d[:, 4:5], boxes3d[:, 5:6]
zeros = boxes3d.new_zeros((boxes_num, 1))
ones = boxes3d.new_ones((boxes_num, 1))
x_corners = torch.cat(
[-l / 2., l / 2., l / 2., -l / 2., -l / 2., l / 2., l / 2., -l / 2.],
dim=1) # (N, 8)
y_corners = torch.cat(
[w / 2., w / 2., -w / 2., -w / 2., w / 2., w / 2., -w / 2., -w / 2.],
dim=1) # (N, 8)
z_corners = torch.cat(
[h / 2., h / 2., h / 2., h / 2., -h / 2., -h / 2., -h / 2., -h / 2.],
dim=1) # (N, 8)
temp_corners = torch.cat(
(x_corners.unsqueeze(dim=2), y_corners.unsqueeze(dim=2),
z_corners.unsqueeze(dim=2)),
dim=2) # (N, 8, 3)
cosa, sina = torch.cos(-ry), torch.sin(-ry)
row_1 = torch.cat([cosa, -sina, zeros], dim=1) # (N, 3)
row_2 = torch.cat([sina, cosa, zeros], dim=1) # (N, 3)
row_3 = torch.cat([zeros, zeros, ones], dim=1) # (N, 3)
R = torch.cat((row_1.unsqueeze(dim=1), row_2.unsqueeze(dim=1),
row_3.unsqueeze(dim=1)),
dim=1) # (N, 3, 3)
rotated_corners = torch.matmul(temp_corners, R) # (N, 8, 3)
x_corners = rotated_corners[:, :, 0]
y_corners = rotated_corners[:, :, 1]
z_corners = rotated_corners[:, :, 2]
x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2]
x = x_loc.view(-1, 1) + x_corners.view(-1, 8)
y = y_loc.view(-1, 1) + y_corners.view(-1, 8)
z = z_loc.view(-1, 1) + z_corners.view(-1, 8)
corners3d = torch.cat(
(x.view(-1, 8, 1), y.view(-1, 8, 1), z.view(-1, 8, 1)), dim=2)
return corners3d
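
A cheap check of the corner layout above: an axis-aligned box centred at the origin with l = w = h = 2 and ry = 0 must produce the eight corners (±1, ±1, ±1). A minimal sketch (the import path is assumed from the transforms module this commit extends):

import torch
from mmdet3d.core.bbox.transforms import box3d_to_corner3d_upright_depth

box = torch.tensor([[0., 0., 0., 2., 2., 2., 0.]])  # x, y, z, l, w, h, ry
corners = box3d_to_corner3d_upright_depth(box)      # shape (1, 8, 3)
assert corners.abs().unique().tolist() == [1.0]     # all corners at +-1
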
from mmdet.core.post_processing import (merge_aug_bboxes, merge_aug_masks,
merge_aug_proposals, merge_aug_scores,
multiclass_nms)
-from .box3d_nms import box3d_multiclass_nms
+from .box3d_nms import aligned_3d_nms, box3d_multiclass_nms
__all__ = [
'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes',
-'merge_aug_scores', 'merge_aug_masks', 'box3d_multiclass_nms'
+'merge_aug_scores', 'merge_aug_masks', 'box3d_multiclass_nms',
+'aligned_3d_nms'
]
@@ -64,3 +64,52 @@ def box3d_multiclass_nms(mlvl_bboxes,
labels = mlvl_scores.new_zeros((0, mlvl_scores.size(-1)))
dir_scores = mlvl_scores.new_zeros((0, ))
return bboxes, scores, labels, dir_scores
def aligned_3d_nms(boxes, scores, classes, thresh):
"""3d nms for aligned boxes.
Args:
boxes (Tensor): Aligned box with shape [n, 6].
scores (Tensor): Scores of each box.
classes (Tensor): Class of each box.
thresh (float): IoU threshold for NMS.
Returns:
Tensor: Indices of selected boxes.
"""
x1 = boxes[:, 0]
y1 = boxes[:, 1]
z1 = boxes[:, 2]
x2 = boxes[:, 3]
y2 = boxes[:, 4]
z2 = boxes[:, 5]
area = (x2 - x1) * (y2 - y1) * (z2 - z1)
zero = boxes.new_zeros(1, )
score_sorted = torch.argsort(scores)
pick = []
while score_sorted.shape[0] != 0:
last = score_sorted.shape[0]
i = score_sorted[-1]
pick.append(i)
xx1 = torch.max(x1[i], x1[score_sorted[:last - 1]])
yy1 = torch.max(y1[i], y1[score_sorted[:last - 1]])
zz1 = torch.max(z1[i], z1[score_sorted[:last - 1]])
xx2 = torch.min(x2[i], x2[score_sorted[:last - 1]])
yy2 = torch.min(y2[i], y2[score_sorted[:last - 1]])
zz2 = torch.min(z2[i], z2[score_sorted[:last - 1]])
classes1 = classes[i]
classes2 = classes[score_sorted[:last - 1]]
inter_l = torch.max(zero, xx2 - xx1)
inter_w = torch.max(zero, yy2 - yy1)
inter_h = torch.max(zero, zz2 - zz1)
inter = inter_l * inter_w * inter_h
iou = inter / (area[i] + area[score_sorted[:last - 1]] - inter)
iou = iou * (classes1 == classes2).float()
score_sorted = score_sorted[torch.nonzero(iou <= thresh).flatten()]
indices = boxes.new_tensor(pick, dtype=torch.long)
return indices
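
The NMS above is class-aware: IoU is zeroed across classes, so a box only suppresses boxes of the same class. A toy sketch (assumes the mmdet3d.core.post_processing export added above):

import torch
from mmdet3d.core.post_processing import aligned_3d_nms

boxes = torch.tensor([[0., 0., 0., 1., 1., 1.],   # x1, y1, z1, x2, y2, z2
                      [0., 0., 0., 1., 1., 1.],   # duplicate of box 0
                      [5., 5., 5., 6., 6., 6.]])
scores = torch.tensor([0.9, 0.4, 0.8])
classes = torch.tensor([0, 0, 1])
keep = aligned_3d_nms(boxes, scores, classes, thresh=0.25)
# keep == tensor([0, 2]): the lower-scored duplicate of box 0 is suppressed
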
@@ -19,12 +19,14 @@ class Custom3DDataset(Dataset):
pipeline=None,
classes=None,
modality=None,
+filter_empty_gt=True,
test_mode=False):
super().__init__()
self.data_root = data_root
self.ann_file = ann_file
self.test_mode = test_mode
self.modality = modality
+self.filter_empty_gt = filter_empty_gt
self.CLASSES = self.get_classes(classes)
self.data_infos = self.load_annotations(self.ann_file)
@@ -52,7 +54,7 @@ class Custom3DDataset(Dataset):
if not self.test_mode:
annos = self.get_ann_info(index)
input_dict['ann_info'] = annos
-if len(annos['gt_bboxes_3d']) == 0:
+if self.filter_empty_gt and len(annos['gt_bboxes_3d']) == 0:
return None
return input_dict
@@ -67,7 +69,8 @@ class Custom3DDataset(Dataset):
return None
self.pre_pipeline(input_dict)
example = self.pipeline(input_dict)
-if example is None or len(example['gt_bboxes_3d']._data) == 0:
+if self.filter_empty_gt and (example is None or len(
+example['gt_bboxes_3d']._data) == 0):
return None
return example
@@ -124,10 +127,13 @@ class Custom3DDataset(Dataset):
results (list[dict]): List of results.
metric (str | list[str]): Metrics to be evaluated.
iou_thr (list[float]): AP IoU thresholds.
"""
from mmdet3d.core.evaluation import indoor_eval
assert isinstance(
results, list), f'Expect results to be list, got {type(results)}.'
assert len(results) > 0, 'Expect length of results > 0.'
assert len(results) == len(self.data_infos)
assert isinstance(
results[0], dict
), f'Expect elements in results to be dict, got {type(results[0])}.'
...
@@ -7,6 +7,7 @@ from .indoor_loading import (LoadAnnotations3D, LoadPointsFromFile,
NormalizePointsColor)
from .indoor_sample import IndoorPointSample
from .loading import LoadMultiViewImageFromFiles
+from .point_seg_class_mapping import PointSegClassMapping
from .train_aug import (GlobalRotScale, ObjectNoise, ObjectRangeFilter,
ObjectSample, PointShuffle, PointsRangeFilter,
RandomFlip3D)
@@ -18,5 +19,5 @@ __all__ = [
'DefaultFormatBundle', 'DefaultFormatBundle3D', 'DataBaseSampler',
'IndoorGlobalRotScale', 'IndoorPointsColorJitter', 'IndoorFlipData',
'MMDataBaseSampler', 'NormalizePointsColor', 'LoadAnnotations3D',
-'IndoorPointSample'
+'IndoorPointSample', 'PointSegClassMapping'
]
@@ -224,7 +224,7 @@ class IndoorGlobalRotScale(object):
results['scale_ratio'] = scale_ratio
results['points'] = points
-results['gt_bboxes_3d'] = gt_bboxes_3d
+results['gt_bboxes_3d'] = gt_bboxes_3d.astype(np.float32)
return results
def __repr__(self):
...
@@ -189,7 +189,8 @@ class LoadAnnotations3D(LoadAnnotations):
self.file_client = mmcv.FileClient(**self.file_client_args)
try:
mask_bytes = self.file_client.get(pts_semantic_mask_path)
-pts_semantic_mask = np.frombuffer(mask_bytes, dtype=np.int)
+# add .copy() to fix read-only bug
+pts_semantic_mask = np.frombuffer(mask_bytes, dtype=np.int).copy()
except ConnectionError:
mmcv.check_file_exist(pts_semantic_mask_path)
pts_semantic_mask = np.fromfile(
...
from mmdet.datasets.builder import PIPELINES
@PIPELINES.register_module()
class PointSegClassMapping(object):
"""Map original semantic class to valid category ids.
Map valid classes as 0~len(valid_cat_ids)-1 and
others as len(valid_cat_ids).
Args:
valid_cat_ids (tuple[int]): A tuple of valid category ids.
"""
def __init__(self, valid_cat_ids):
self.valid_cat_ids = valid_cat_ids
def __call__(self, results):
assert 'pts_semantic_mask' in results
pts_semantic_mask = results['pts_semantic_mask']
neg_cls = len(self.valid_cat_ids)
for i in range(pts_semantic_mask.shape[0]):
if pts_semantic_mask[i] in self.valid_cat_ids:
converted_id = self.valid_cat_ids.index(pts_semantic_mask[i])
pts_semantic_mask[i] = converted_id
else:
pts_semantic_mask[i] = neg_cls
results['pts_semantic_mask'] = pts_semantic_mask
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += '(valid_cat_ids={})'.format(self.valid_cat_ids)
return repr_str
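
For example, with a small subset of the ScanNet valid_cat_ids used in the config above, the mapping compresses valid raw ids to 0..N-1 and sends everything else to N:

import numpy as np

mapping = PointSegClassMapping(valid_cat_ids=(3, 4, 5))
results = dict(pts_semantic_mask=np.array([3, 7, 5, 4]))
out = mapping(results)['pts_semantic_mask']
# out == array([0, 3, 2, 1]): raw id 7 is invalid, so it maps to
# len(valid_cat_ids) == 3
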
@@ -20,9 +20,10 @@ class ScanNetDataset(Custom3DDataset):
pipeline=None,
classes=None,
modality=None,
+filter_empty_gt=True,
test_mode=False):
super().__init__(data_root, ann_file, pipeline, classes, modality,
-test_mode)
+filter_empty_gt, test_mode)
def get_ann_info(self, index):
# Use index to get the annos, thus the evalhook could also use this api
...
@@ -16,9 +16,10 @@ class SUNRGBDDataset(Custom3DDataset):
pipeline=None,
classes=None,
modality=None,
+filter_empty_gt=True,
test_mode=False):
super().__init__(data_root, ann_file, pipeline, classes, modality,
-test_mode)
+filter_empty_gt, test_mode)
def get_ann_info(self, index):
# Use index to get the annos, thus the evalhook could also use this api
...
@@ -8,6 +8,7 @@ from .detectors import * # noqa: F401,F403
from .fusion_layers import * # noqa: F401,F403
from .losses import * # noqa: F401,F403
from .middle_encoders import * # noqa: F401,F403
+from .model_utils import * # noqa: F401,F403
from .necks import * # noqa: F401,F403
from .registry import FUSION_LAYERS, MIDDLE_ENCODERS, VOXEL_ENCODERS
from .roi_heads import * # noqa: F401,F403
...
from .anchor3d_head import Anchor3DHead
from .parta2_rpn_head import PartA2RPNHead
+from .vote_head import VoteHead
-__all__ = ['Anchor3DHead', 'PartA2RPNHead']
+__all__ = ['Anchor3DHead', 'PartA2RPNHead', 'VoteHead']
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmdet3d.core import build_bbox_coder, multi_apply
from mmdet3d.core.bbox.box_torch_ops import boxes3d_to_corners3d_lidar_torch
from mmdet3d.core.bbox.transforms import upright_depth_to_lidar_torch
from mmdet3d.core.post_processing import aligned_3d_nms
from mmdet3d.models.builder import build_loss
from mmdet3d.models.losses import chamfer_distance
from mmdet3d.models.model_utils import VoteModule
from mmdet3d.ops import (PointSAModule, furthest_point_sample,
points_in_boxes_batch)
from mmdet.models import HEADS
@HEADS.register_module()
class VoteHead(nn.Module):
"""Bbox head of Votenet.
https://arxiv.org/pdf/1904.09664.pdf
Args:
num_classes (int): The number of classes.
bbox_coder (BaseBBoxCoder): Bbox coder for encoding and
decoding boxes.
train_cfg (dict): Config for training.
test_cfg (dict): Config for testing.
vote_module_cfg (dict): Config of VoteModule for point-wise votes.
vote_aggregation_cfg (dict): Config of vote aggregation layer.
feat_channels (tuple[int]): Convolution channels of
prediction layer.
conv_cfg (dict): Config of convolution in prediction layer.
norm_cfg (dict): Config of BN in prediction layer.
objectness_loss (dict): Config of objectness loss.
center_loss (dict): Config of center loss.
dir_class_loss (dict): Config of direction classification loss.
dir_res_loss (dict): Config of direction residual regression loss.
size_class_loss (dict): Config of size classification loss.
size_res_loss (dict): Config of size residual regression loss.
semantic_loss (dict): Config of point-wise semantic segmentation loss.
"""
def __init__(self,
num_classes,
bbox_coder,
train_cfg=None,
test_cfg=None,
vote_module_cfg=None,
vote_aggregation_cfg=None,
feat_channels=(128, 128),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=None,
center_loss=None,
dir_class_loss=None,
dir_res_loss=None,
size_class_loss=None,
size_res_loss=None,
semantic_loss=None):
super(VoteHead, self).__init__()
self.num_classes = num_classes
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.gt_per_seed = vote_module_cfg['gt_per_seed']
self.num_proposal = vote_aggregation_cfg['num_point']
self.objectness_loss = build_loss(objectness_loss)
self.center_loss = build_loss(center_loss)
self.dir_class_loss = build_loss(dir_class_loss)
self.dir_res_loss = build_loss(dir_res_loss)
self.size_class_loss = build_loss(size_class_loss)
self.size_res_loss = build_loss(size_res_loss)
self.semantic_loss = build_loss(semantic_loss)
assert vote_aggregation_cfg['mlp_channels'][0] == vote_module_cfg[
'in_channels']
self.bbox_coder = build_bbox_coder(bbox_coder)
self.num_sizes = self.bbox_coder.num_sizes
self.num_dir_bins = self.bbox_coder.num_dir_bins
self.vote_module = VoteModule(**vote_module_cfg)
self.vote_aggregation = PointSAModule(**vote_aggregation_cfg)
prev_channel = vote_aggregation_cfg['mlp_channels'][-1]
conv_pred_list = list()
for k in range(len(feat_channels)):
conv_pred_list.append(
ConvModule(
prev_channel,
feat_channels[k],
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
bias=True,
inplace=True))
prev_channel = feat_channels[k]
self.conv_pred = nn.Sequential(*conv_pred_list)
# Objectness scores (2), center residual (3),
# heading class+residual (num_dir_bins*2),
# size class+residual (num_sizes*4), semantic scores (num_classes)
conv_out_channel = (2 + 3 + self.num_dir_bins * 2 +
self.num_sizes * 4 + num_classes)
self.conv_pred.add_module('conv_out',
nn.Conv1d(prev_channel, conv_out_channel, 1))
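# Note (added for clarity): with the ScanNet config above
# (num_dir_bins=1, num_sizes=18, num_classes=18) this gives
# conv_out_channel = 2 + 3 + 2 + 72 + 18 = 97; with the SUN RGB-D config
# (num_dir_bins=12, num_sizes=10, num_classes=10) it is
# 2 + 3 + 24 + 40 + 10 = 79.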
def init_weights(self):
pass
def forward(self, feat_dict, sample_mod):
"""Forward pass.
The forward of VoteHead is divided into 4 steps:
1. Generate vote_points from seed_points.
2. Aggregate vote_points.
3. Predict bbox and score.
4. Decode predictions.
Args:
feat_dict (dict): Feature dict from the backbone.
sample_mod (str): Sample mode for the vote aggregation layer.
Valid modes are "vote", "seed" and "random".
"""
assert sample_mod in ['vote', 'seed', 'random']
seed_points = feat_dict['fp_xyz'][-1]
seed_features = feat_dict['fp_features'][-1]
seed_indices = feat_dict['fp_indices'][-1]
# 1. generate vote_points from seed_points
vote_points, vote_features = self.vote_module(seed_points,
seed_features)
results = dict(
seed_points=seed_points,
seed_indices=seed_indices,
vote_points=vote_points,
vote_features=vote_features)
# 2. aggregate vote_points
if sample_mod == 'vote':
# use fps in vote_aggregation
sample_indices = None
elif sample_mod == 'seed':
# FPS on seed and choose the votes corresponding to the seeds
sample_indices = furthest_point_sample(seed_points,
self.num_proposal)
elif sample_mod == 'random':
# Random sampling from the votes
batch_size, num_seed = seed_points.shape[:2]
sample_indices = seed_points.new_tensor(
torch.randint(0, num_seed, (batch_size, self.num_proposal)),
dtype=torch.int32)
else:
raise NotImplementedError
vote_aggregation_ret = self.vote_aggregation(vote_points,
vote_features,
sample_indices)
aggregated_points, features, aggregated_indices = vote_aggregation_ret
results['aggregated_points'] = aggregated_points
results['aggregated_indices'] = aggregated_indices
# 3. predict bbox and score
predictions = self.conv_pred(features)
# 4. decode predictions
decode_res = self.bbox_coder.split_pred(predictions, aggregated_points)
results.update(decode_res)
return results
def loss(self,
bbox_preds,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
img_meta=None,
gt_bboxes_ignore=None):
targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask,
bbox_preds)
(vote_targets, vote_target_masks, size_class_targets, size_res_targets,
dir_class_targets, dir_res_targets, center_targets, mask_targets,
valid_gt_masks, objectness_targets, objectness_weights,
box_loss_weights, valid_gt_weights) = targets
# calculate vote loss
vote_loss = self.vote_module.get_loss(bbox_preds['seed_points'],
bbox_preds['vote_points'],
bbox_preds['seed_indices'],
vote_target_masks, vote_targets)
# calculate objectness loss
objectness_loss = self.objectness_loss(
bbox_preds['obj_scores'].transpose(2, 1),
objectness_targets,
weight=objectness_weights)
# calculate center loss
source2target_loss, target2source_loss = self.center_loss(
bbox_preds['center'],
center_targets,
src_weight=box_loss_weights,
dst_weight=valid_gt_weights)
center_loss = source2target_loss + target2source_loss
# calculate direction class loss
dir_class_loss = self.dir_class_loss(
bbox_preds['dir_class'].transpose(2, 1),
dir_class_targets,
weight=box_loss_weights)
# calculate direction residual loss
batch_size, proposal_num = size_class_targets.shape[:2]
heading_label_one_hot = vote_targets.new_zeros(
(batch_size, proposal_num, self.num_dir_bins))
heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1)
dir_res_norm = torch.sum(
bbox_preds['dir_res_norm'] * heading_label_one_hot, -1)
dir_res_loss = self.dir_res_loss(
dir_res_norm, dir_res_targets, weight=box_loss_weights)
# calculate size class loss
size_class_loss = self.size_class_loss(
bbox_preds['size_class'].transpose(2, 1),
size_class_targets,
weight=box_loss_weights)
# calculate size residual loss
one_hot_size_targets = vote_targets.new_zeros(
(batch_size, proposal_num, self.num_sizes))
one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1)
one_hot_size_targets_expand = one_hot_size_targets.unsqueeze(
-1).repeat(1, 1, 1, 3)
size_residual_norm = torch.sum(
bbox_preds['size_res_norm'] * one_hot_size_targets_expand, 2)
box_loss_weights_expand = box_loss_weights.unsqueeze(-1).repeat(
1, 1, 3)
size_res_loss = self.size_res_loss(
size_residual_norm,
size_res_targets,
weight=box_loss_weights_expand)
# calculate semantic loss
semantic_loss = self.semantic_loss(
bbox_preds['sem_scores'].transpose(2, 1),
mask_targets,
weight=box_loss_weights)
losses = dict(
vote_loss=vote_loss,
objectness_loss=objectness_loss,
semantic_loss=semantic_loss,
center_loss=center_loss,
dir_class_loss=dir_class_loss,
dir_res_loss=dir_res_loss,
size_class_loss=size_class_loss,
size_res_loss=size_res_loss)
return losses
def get_targets(self,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
bbox_preds=None):
"""Get targets of vote head.
Args:
points (list[Tensor]): Points of each batch.
gt_bboxes_3d (list[Tensor]): gt bboxes of each batch.
gt_labels_3d (list[Tensor]): gt class labels of each batch.
pts_semantic_mask (None | list[Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (None | list[Tensor]): point-wise instance
label of each batch.
bbox_preds (Tensor): Bbox predictions of vote head.
Returns:
tuple: Targets of vote head.
"""
# find empty example
valid_gt_masks = list()
gt_num = list()
for index in range(len(gt_labels_3d)):
if len(gt_labels_3d[index]) == 0:
gt_bboxes_3d[index] = gt_bboxes_3d[index].new_zeros(
1, gt_bboxes_3d[index].shape[-1])
gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1)
valid_gt_masks.append(gt_labels_3d[index].new_zeros(1))
gt_num.append(1)
else:
valid_gt_masks.append(gt_labels_3d[index].new_ones(
gt_labels_3d[index].shape))
gt_num.append(gt_labels_3d[index].shape[0])
max_gt_num = max(gt_num)
if pts_semantic_mask is None:
pts_semantic_mask = [None for i in range(len(gt_labels_3d))]
pts_instance_mask = [None for i in range(len(gt_labels_3d))]
aggregated_points = [
bbox_preds['aggregated_points'][i]
for i in range(len(gt_labels_3d))
]
(vote_targets, vote_target_masks, size_class_targets, size_res_targets,
dir_class_targets, dir_res_targets, center_targets, mask_targets,
objectness_targets, objectness_masks) = multi_apply(
self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask, aggregated_points)
# pad targets as original code of votenet.
for index in range(len(gt_labels_3d)):
pad_num = max_gt_num - gt_labels_3d[index].shape[0]
center_targets[index] = F.pad(center_targets[index],
(0, 0, 0, pad_num))
valid_gt_masks[index] = F.pad(valid_gt_masks[index], (0, pad_num))
vote_targets = torch.stack(vote_targets)
vote_target_masks = torch.stack(vote_target_masks)
center_targets = torch.stack(center_targets)
valid_gt_masks = torch.stack(valid_gt_masks)
objectness_targets = torch.stack(objectness_targets)
objectness_weights = torch.stack(objectness_masks)
objectness_weights /= (torch.sum(objectness_weights) + 1e-6)
box_loss_weights = objectness_targets.float() / (
torch.sum(objectness_targets).float() + 1e-6)
valid_gt_weights = valid_gt_masks.float() / (
torch.sum(valid_gt_masks.float()) + 1e-6)
dir_class_targets = torch.stack(dir_class_targets)
dir_res_targets = torch.stack(dir_res_targets)
size_class_targets = torch.stack(size_class_targets)
size_res_targets = torch.stack(size_res_targets)
mask_targets = torch.stack(mask_targets)
return (vote_targets, vote_target_masks, size_class_targets,
size_res_targets, dir_class_targets, dir_res_targets,
center_targets, mask_targets, valid_gt_masks,
objectness_targets, objectness_weights, box_loss_weights,
valid_gt_weights)
def get_targets_single(self,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
aggregated_points=None):
assert self.bbox_coder.with_rot or pts_semantic_mask is not None
# generate votes target
num_points = points.shape[0]
if self.bbox_coder.with_rot:
points_lidar, gt_bboxes_3d_lidar = upright_depth_to_lidar_torch(
points, gt_bboxes_3d, to_bottom_center=True)
vote_targets = points.new_zeros([num_points, 3 * self.gt_per_seed])
vote_target_masks = points.new_zeros([num_points],
dtype=torch.long)
vote_target_idx = points.new_zeros([num_points], dtype=torch.long)
box_indices_all = points_in_boxes_batch(
points_lidar.unsqueeze(0), gt_bboxes_3d_lidar.unsqueeze(0))[0]
for i in range(gt_bboxes_3d.shape[0]):
box_indices = box_indices_all[:, i]
indices = torch.nonzero(box_indices).squeeze(-1)
selected_points = points[indices]
vote_target_masks[indices] = 1
vote_targets_tmp = vote_targets[indices]
votes = gt_bboxes_3d[i][:3].unsqueeze(
0) - selected_points[:, :3]
for j in range(self.gt_per_seed):
column_indices = torch.nonzero(
vote_target_idx[indices] == j).squeeze(-1)
vote_targets_tmp[column_indices,
int(j * 3):int(j * 3 +
3)] = votes[column_indices]
if j == 0:
vote_targets_tmp[column_indices] = votes[
column_indices].repeat(1, self.gt_per_seed)
vote_targets[indices] = vote_targets_tmp
vote_target_idx[indices] = torch.clamp(
vote_target_idx[indices] + 1, max=2)
elif pts_semantic_mask is not None:
vote_targets = points.new_zeros([num_points, 3])
vote_target_masks = points.new_zeros([num_points],
dtype=torch.long)
for i in torch.unique(pts_instance_mask):
indices = torch.nonzero(pts_instance_mask == i).squeeze(-1)
if pts_semantic_mask[indices[0]] < self.num_classes:
selected_points = points[indices, :3]
center = 0.5 * (
selected_points.min(0)[0] + selected_points.max(0)[0])
vote_targets[indices, :] = center - selected_points
vote_target_masks[indices] = 1
vote_targets = vote_targets.repeat((1, self.gt_per_seed))
else:
raise NotImplementedError
(center_targets, size_class_targets, size_res_targets,
dir_class_targets,
dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d)
proposal_num = aggregated_points.shape[0]
distance1, _, assignment, _ = chamfer_distance(
aggregated_points.unsqueeze(0),
center_targets.unsqueeze(0),
reduction='none')
assignment = assignment.squeeze(0)
euclidean_distance1 = torch.sqrt(distance1.squeeze(0) + 1e-6)
objectness_targets = points.new_zeros((proposal_num), dtype=torch.long)
objectness_targets[
euclidean_distance1 < self.train_cfg['pos_distance_thr']] = 1
objectness_masks = points.new_zeros((proposal_num))
objectness_masks[
euclidean_distance1 < self.train_cfg['pos_distance_thr']] = 1.0
objectness_masks[
euclidean_distance1 > self.train_cfg['neg_distance_thr']] = 1.0
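# With the train_cfg above (pos_distance_thr=0.3, neg_distance_thr=0.6),
# proposals whose nearest gt center lies within 0.3 m become positives,
# those farther than 0.6 m become weighted negatives, and the 0.3-0.6 m
# band keeps objectness_masks == 0, i.e. it is ignored by the
# objectness loss.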
dir_class_targets = dir_class_targets[assignment]
dir_res_targets = dir_res_targets[assignment]
dir_res_targets /= (np.pi / self.num_dir_bins)
size_class_targets = size_class_targets[assignment]
size_res_targets = size_res_targets[assignment]
one_hot_size_targets = gt_bboxes_3d.new_zeros(
(proposal_num, self.num_sizes))
one_hot_size_targets.scatter_(1, size_class_targets.unsqueeze(-1), 1)
one_hot_size_targets = one_hot_size_targets.unsqueeze(-1).repeat(
1, 1, 3)
mean_sizes = size_res_targets.new_tensor(
self.bbox_coder.mean_sizes).unsqueeze(0)
pos_mean_sizes = torch.sum(one_hot_size_targets * mean_sizes, 1)
size_res_targets /= pos_mean_sizes
mask_targets = gt_labels_3d[assignment]
return (vote_targets, vote_target_masks, size_class_targets,
size_res_targets,
dir_class_targets, dir_res_targets, center_targets,
mask_targets.long(), objectness_targets, objectness_masks)
def get_bboxes(self, points, bbox_preds, img_meta, rescale=False):
# decode boxes
obj_scores = F.softmax(bbox_preds['obj_scores'], dim=-1)[..., -1]
sem_scores = F.softmax(bbox_preds['sem_scores'], dim=-1)
bbox_depth = self.bbox_coder.decode(bbox_preds)
points_lidar, bbox_lidar = upright_depth_to_lidar_torch(
points[..., :3], bbox_depth, to_bottom_center=True)
batch_size = bbox_depth.shape[0]
results = list()
for b in range(batch_size):
bbox_selected, score_selected, labels = self.multiclass_nms_single(
obj_scores[b], sem_scores[b], bbox_lidar[b], points_lidar[b])
results.append((bbox_selected, score_selected, labels))
return results
def multiclass_nms_single(self, obj_scores, sem_scores, bbox,
points_lidar):
box_indices = points_in_boxes_batch(
points_lidar.unsqueeze(0), bbox.unsqueeze(0))[0]
nonempty_box_mask = box_indices.T.sum(1) > 5
bbox_classes = torch.argmax(sem_scores, -1)
# boxes3d to aligned boxes
corner3d = boxes3d_to_corners3d_lidar_torch(bbox)
minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6)))
minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0]
minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0]
nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask],
obj_scores[nonempty_box_mask],
bbox_classes[nonempty_box_mask],
self.test_cfg.nms_thr)
# filter empty boxes and boxes with low score
scores_mask = (obj_scores > self.test_cfg.score_thr)
nonempty_box_inds = torch.nonzero(nonempty_box_mask).flatten()
nonempty_mask = torch.zeros_like(bbox_classes).scatter(
0, nonempty_box_inds[nms_selected], 1)
selected = (nonempty_mask.bool() & scores_mask.bool())
if self.test_cfg.per_class_proposal:
bbox_selected, score_selected, labels = [], [], []
for k in range(sem_scores.shape[-1]):
bbox_selected.append(bbox[selected])
score_selected.append(obj_scores[selected] *
sem_scores[selected][:, k])
labels.append(
torch.zeros_like(bbox_classes[selected]).fill_(k))
bbox_selected = torch.cat(bbox_selected, 0)
score_selected = torch.cat(score_selected, 0)
labels = torch.cat(labels, 0)
else:
bbox_selected = bbox[selected]
score_selected = obj_scores[selected]
labels = bbox_classes[selected]
return bbox_selected, score_selected, labels
@@ -4,10 +4,11 @@ from .mvx_faster_rcnn import (DynamicMVXFasterRCNN, DynamicMVXFasterRCNNV2,
from .mvx_single_stage import MVXSingleStageDetector
from .mvx_two_stage import MVXTwoStageDetector
from .parta2 import PartA2
+from .votenet import VoteNet
from .voxelnet import DynamicVoxelNet, VoxelNet
__all__ = [
'BaseDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXSingleStageDetector',
'MVXTwoStageDetector', 'DynamicMVXFasterRCNN', 'DynamicMVXFasterRCNNV2',
-'DynamicMVXFasterRCNNV3', 'PartA2'
+'DynamicMVXFasterRCNNV3', 'PartA2', 'VoteNet'
]
import torch
from mmdet3d.core import bbox3d2result
from mmdet.models import DETECTORS, SingleStageDetector
@DETECTORS.register_module()
class VoteNet(SingleStageDetector):
"""VoteNet model.
https://arxiv.org/pdf/1904.09664.pdf
"""
def __init__(self,
backbone,
bbox_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(VoteNet, self).__init__(
backbone=backbone,
bbox_head=bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained)
def extract_feat(self, points):
x = self.backbone(points)
if self.with_neck:
x = self.neck(x)
return x
def forward_train(self,
points,
img_meta,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
gt_bboxes_ignore=None):
"""Forward of training.
Args:
points (list[Tensor]): Points of each batch.
img_meta (list): Image metas.
gt_bboxes_3d (list[Tensor]): gt bboxes of each batch.
gt_labels_3d (list[Tensor]): gt class labels of each batch.
pts_semantic_mask (None | list[Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (None | list[Tensor]): point-wise instance
label of each batch.
gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
boxes to ignore.
Returns:
dict: Losses.
"""
points_cat = torch.stack(points) # tmp
x = self.extract_feat(points_cat)
bbox_preds = self.bbox_head(x, self.train_cfg.sample_mod)
loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask,
pts_instance_mask, img_meta)
losses = self.bbox_head.loss(
bbox_preds, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
return losses
def forward_test(self, **kwargs):
return self.simple_test(**kwargs)
def forward(self, return_loss=True, **kwargs):
if return_loss:
return self.forward_train(**kwargs)
else:
return self.forward_test(**kwargs)
def simple_test(self,
points,
img_meta,
gt_bboxes_3d=None,
gt_labels_3d=None,
pts_semantic_mask=None,
pts_instance_mask=None,
rescale=False):
"""Forward of testing.
Args:
points (list[Tensor]): Points of each sample.
img_meta (list): Image metas.
gt_bboxes_3d (list[Tensor]): gt bboxes of each sample.
gt_labels_3d (list[Tensor]): gt class labels of each sample.
pts_semantic_mask (None | list[Tensor]): point-wise semantic
label of each sample.
pts_instance_mask (None | list[Tensor]): point-wise instance
label of each sample.
rescale (bool): Whether to rescale results.
Returns:
list: Predicted 3d boxes.
"""
points_cat = torch.stack(points) # tmp
x = self.extract_feat(points_cat)
bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod)
bbox_list = self.bbox_head.get_bboxes(
points_cat, bbox_preds, img_meta, rescale=rescale)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results[0]
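
As an end-to-end smoke test, the detector can be built directly from one of the configs at the top of this commit. A minimal sketch (assumes mmcv's Config and mmdet's build_detector; the config path is hypothetical):

import torch
from mmcv import Config
from mmdet.models import build_detector

cfg = Config.fromfile('configs/votenet_scannet.py')  # hypothetical path
model = build_detector(cfg.model, train_cfg=cfg.train_cfg,
                       test_cfg=cfg.test_cfg)
# one sample of 40000 points with x, y, z + height, matching in_channels=4
points = [torch.rand(40000, 4)]
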