"docs/source/vscode:/vscode.git/clone" did not exist on "8f36444c4f9a55669bcb64e20b5588c0dd72bd93"
Commit 19a56f6b authored by zhangwenwei

Merge branch 'votenet' into 'master'

Votenet

See merge request open-mmlab/mmdet.3d!46
parents ac3590a1 f717eb62
# model settings
model = dict(
    type='VoteNet',
    backbone=dict(
        type='PointNet2SASSG',
        in_channels=4,
        num_points=(2048, 1024, 512, 256),
        radius=(0.2, 0.4, 0.8, 1.2),
        num_samples=(64, 32, 16, 16),
        sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
                     (128, 128, 256)),
        fp_channels=((256, 256), (256, 256)),
        norm_cfg=dict(type='BN2d'),
        pool_mod='max'),
    bbox_head=dict(
        type='VoteHead',
        num_classes=18,
        bbox_coder=dict(
            type='PartialBinBasedBBoxCoder',
            num_sizes=18,
            num_dir_bins=1,
            with_rot=False,
            mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
                        [1.876858, 1.8425595, 1.1931566],
                        [0.61328, 0.6148609, 0.7182701],
                        [1.3955007, 1.5121545, 0.83443564],
                        [0.97949594, 1.0675149, 0.6329687],
                        [0.531663, 0.5955577, 1.7500148],
                        [0.9624706, 0.72462326, 1.1481868],
                        [0.83221924, 1.0490936, 1.6875663],
                        [0.21132214, 0.4206159, 0.5372846],
                        [1.4440073, 1.8970833, 0.26985747],
                        [1.0294262, 1.4040797, 0.87554324],
                        [1.3766412, 0.65521795, 1.6813129],
                        [0.6650819, 0.71111923, 1.298853],
                        [0.41999173, 0.37906948, 1.7513971],
                        [0.59359556, 0.5912492, 0.73919016],
                        [0.50867593, 0.50656086, 0.30136237],
                        [1.1511526, 1.0546296, 0.49706793],
                        [0.47535285, 0.49249494, 0.5802117]]),
        vote_moudule_cfg=dict(
            in_channels=256,
            vote_per_seed=1,
            gt_per_seed=3,
            conv_channels=(256, 256),
            conv_cfg=dict(type='Conv1d'),
            norm_cfg=dict(type='BN1d'),
            norm_feats=True,
            vote_loss=dict(
                type='ChamferDistance',
                mode='l1',
                reduction='none',
                loss_dst_weight=10.0)),
        vote_aggregation_cfg=dict(
            num_point=256,
            radius=0.3,
            num_sample=16,
            mlp_channels=[256, 128, 128, 128],
            use_xyz=True,
            normalize_xyz=True),
        feat_channels=(128, 128),
        conv_cfg=dict(type='Conv1d'),
        norm_cfg=dict(type='BN1d'),
        objectness_loss=dict(
            type='CrossEntropyLoss',
            class_weight=[0.2, 0.8],
            reduction='sum',
            loss_weight=5.0),
        center_loss=dict(
            type='ChamferDistance',
            mode='l2',
            reduction='sum',
            loss_src_weight=10.0,
            loss_dst_weight=10.0),
        dir_class_loss=dict(
            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
        dir_res_loss=dict(
            type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
        size_class_loss=dict(
            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
        size_res_loss=dict(
            type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
        semantic_loss=dict(
            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)))
# model training and testing settings
train_cfg = dict(pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')
test_cfg = dict(
    sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True)
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
               'bookshelf', 'picture', 'counter', 'desk', 'curtain',
               'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
               'garbagebin')
train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        shift_height=True,
        load_dim=6,
        use_dim=[0, 1, 2]),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_mask_3d=True,
        with_seg_3d=True),
    dict(
        type='PointSegClassMapping',
        valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33,
                       34, 36, 39)),
    dict(type='IndoorPointSample', num_points=40000),
    dict(type='IndoorFlipData', flip_ratio_yz=0.5, flip_ratio_xz=0.5),
    dict(
        type='IndoorGlobalRotScale',
        shift_height=True,
        rot_range=[-1 / 36, 1 / 36],
        scale_range=None),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=[
            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
            'pts_instance_mask'
        ])
]
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        shift_height=True,
        load_dim=6,
        use_dim=[0, 1, 2]),
    dict(type='IndoorPointSample', num_points=40000),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points'])
]
data = dict(
    samples_per_gpu=8,
    workers_per_gpu=4,
    train=dict(
        type='RepeatDataset',
        times=5,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            ann_file=data_root + 'scannet_infos_train.pkl',
            pipeline=train_pipeline,
            filter_empty_gt=False,
            classes=class_names)),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'scannet_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        test_mode=True),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'scannet_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        test_mode=True))
# optimizer
lr = 0.008  # max learning rate
optimizer = dict(type='Adam', lr=lr)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[24, 32])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=10,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
# runtime settings
total_epochs = 36
dist_params = dict(backend='nccl')
log_level = 'INFO'
find_unused_parameters = True
work_dir = './work_dirs/votenet_scannet'
load_from = None
resume_from = None
workflow = [('train', 1)]
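
For reference, a minimal sketch (not part of the commit) of building the detector from the ScanNet config above. The file name configs/votenet_scannet.py is an assumption, and the mmdet v2-style builder is assumed to be available:

from mmcv import Config
from mmdet.models import build_detector

cfg = Config.fromfile('configs/votenet_scannet.py')
detector = build_detector(
    cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
print(type(detector).__name__)  # VoteNet

The SUN RGB-D variant of the same config follows.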
# model settings
model = dict(
    type='VoteNet',
    backbone=dict(
        type='PointNet2SASSG',
        in_channels=4,
        num_points=(2048, 1024, 512, 256),
        radius=(0.2, 0.4, 0.8, 1.2),
        num_samples=(64, 32, 16, 16),
        sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
                     (128, 128, 256)),
        fp_channels=((256, 256), (256, 256)),
        norm_cfg=dict(type='BN2d'),
        pool_mod='max'),
    bbox_head=dict(
        type='VoteHead',
        num_classes=10,
        bbox_coder=dict(
            type='PartialBinBasedBBoxCoder',
            num_sizes=10,
            num_dir_bins=12,
            with_rot=True,
            mean_sizes=[[2.114256, 1.620300, 0.927272],
                        [0.791118, 1.279516, 0.718182],
                        [0.923508, 1.867419, 0.845495],
                        [0.591958, 0.552978, 0.827272],
                        [0.699104, 0.454178, 0.75625],
                        [0.69519, 1.346299, 0.736364],
                        [0.528526, 1.002642, 1.172878],
                        [0.500618, 0.632163, 0.683424],
                        [0.404671, 1.071108, 1.688889],
                        [0.76584, 1.398258, 0.472728]]),
        vote_moudule_cfg=dict(
            in_channels=256,
            vote_per_seed=1,
            gt_per_seed=3,
            conv_channels=(256, 256),
            conv_cfg=dict(type='Conv1d'),
            norm_cfg=dict(type='BN1d'),
            norm_feats=True,
            vote_loss=dict(
                type='ChamferDistance',
                mode='l1',
                reduction='none',
                loss_dst_weight=10.0)),
        vote_aggregation_cfg=dict(
            num_point=256,
            radius=0.3,
            num_sample=16,
            mlp_channels=[256, 128, 128, 128],
            use_xyz=True,
            normalize_xyz=True),
        feat_channels=(128, 128),
        conv_cfg=dict(type='Conv1d'),
        norm_cfg=dict(type='BN1d'),
        objectness_loss=dict(
            type='CrossEntropyLoss',
            class_weight=[0.2, 0.8],
            reduction='sum',
            loss_weight=5.0),
        center_loss=dict(
            type='ChamferDistance',
            mode='l2',
            reduction='sum',
            loss_src_weight=10.0,
            loss_dst_weight=10.0),
        dir_class_loss=dict(
            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
        dir_res_loss=dict(
            type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
        size_class_loss=dict(
            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
        size_res_loss=dict(
            type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
        semantic_loss=dict(
            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)))
# model training and testing settings
train_cfg = dict(pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')
test_cfg = dict(
    sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True)
# dataset settings
dataset_type = 'SUNRGBDDataset'
data_root = 'data/sunrgbd/'
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
               'night_stand', 'bookshelf', 'bathtub')
train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        shift_height=True,
        load_dim=6,
        use_dim=[0, 1, 2]),
    dict(type='LoadAnnotations3D'),
    dict(type='IndoorFlipData', flip_ratio_yz=0.5),
    dict(
        type='IndoorGlobalRotScale',
        shift_height=True,
        rot_range=[-1 / 6, 1 / 6],
        scale_range=[0.85, 1.15]),
    dict(type='IndoorPointSample', num_points=20000),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        shift_height=True,
        load_dim=6,
        use_dim=[0, 1, 2]),
    dict(type='IndoorPointSample', num_points=20000),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points'])
]
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=4,
    train=dict(
        type='RepeatDataset',
        times=5,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            ann_file=data_root + 'sunrgbd_infos_train.pkl',
            pipeline=train_pipeline,
            classes=class_names,
            filter_empty_gt=False)),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'sunrgbd_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        test_mode=True),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'sunrgbd_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        test_mode=True))
# optimizer
lr = 0.008  # max learning rate
optimizer = dict(type='Adam', lr=lr)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[24, 32])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=30,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
# runtime settings
total_epochs = 36
dist_params = dict(backend='nccl')
log_level = 'INFO'
find_unused_parameters = True
work_dir = './work_dirs/votenet_sunrgbd'
load_from = None
resume_from = None
workflow = [('train', 1)]
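
Similarly, a minimal sketch (again an assumption, not part of the commit) of building the SUN RGB-D training set from this config through mmdet's dataset builder, assuming the file is saved as configs/votenet_sunrgbd.py:

from mmcv import Config
from mmdet.datasets import build_dataset

cfg = Config.fromfile('configs/votenet_sunrgbd.py')
dataset = build_dataset(cfg.data.train)
# a RepeatDataset that wraps SUNRGBDDataset and repeats it 5 times per epoch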
@@ -8,7 +8,9 @@ from .samplers import (BaseSampler, CombinedSampler,
                        InstanceBalancedPosSampler, IoUBalancedNegSampler,
                        PseudoSampler, RandomSampler, SamplingResult)
 from .structures import Box3DMode, CameraInstance3DBoxes, LiDARInstance3DBoxes
-from .transforms import bbox3d2result, bbox3d2roi, boxes3d_to_bev_torch_lidar
+from .transforms import (bbox3d2result, bbox3d2roi,
+                         box3d_to_corner3d_upright_depth,
+                         boxes3d_to_bev_torch_lidar)

 from .assign_sampling import (  # isort:skip, avoid recursive imports
     build_bbox_coder,  # temporally settings
@@ -22,5 +24,6 @@ __all__ = [
     'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'boxes3d_to_bev_torch_lidar',
     'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d',
     'bbox_overlaps_3d', 'Box3DMode', 'LiDARInstance3DBoxes',
-    'CameraInstance3DBoxes', 'bbox3d2roi', 'bbox3d2result'
+    'CameraInstance3DBoxes', 'bbox3d2roi', 'bbox3d2result',
+    'box3d_to_corner3d_upright_depth'
 ]
 from mmdet.core.bbox import build_bbox_coder
 from .delta_xyzwhlr_bbox_coder import DeltaXYZWLHRBBoxCoder
+from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder

-__all__ = ['build_bbox_coder', 'DeltaXYZWLHRBBoxCoder']
+__all__ = [
+    'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'PartialBinBasedBBoxCoder'
+]
import numpy as np
import torch

from mmdet.core.bbox import BaseBBoxCoder
from mmdet.core.bbox.builder import BBOX_CODERS


@BBOX_CODERS.register_module()
class PartialBinBasedBBoxCoder(BaseBBoxCoder):
    """Partial bin based bbox coder.

    Args:
        num_dir_bins (int): Number of bins to encode direction angle.
        num_sizes (int): Number of size clusters.
        mean_sizes (list[list[int]]): Mean size of bboxes in each class.
        with_rot (bool): Whether the bbox is with rotation.
    """

    def __init__(self, num_dir_bins, num_sizes, mean_sizes, with_rot=True):
        super(PartialBinBasedBBoxCoder, self).__init__()
        assert len(mean_sizes) == num_sizes
        self.num_dir_bins = num_dir_bins
        self.num_sizes = num_sizes
        self.mean_sizes = mean_sizes
        self.with_rot = with_rot

    def encode(self, gt_bboxes_3d, gt_labels_3d):
        """Encode ground truth to prediction targets.

        Args:
            gt_bboxes_3d (Tensor): 3d gt bboxes with shape (n, 7).
            gt_labels_3d (Tensor): Gt classes.

        Returns:
            tuple: Targets of center, size and direction.
        """
        # generate center target
        center_target = gt_bboxes_3d[..., 0:3]

        # generate bbox size target
        size_class_target = gt_labels_3d
        size_res_target = gt_bboxes_3d[..., 3:6] - gt_bboxes_3d.new_tensor(
            self.mean_sizes)[size_class_target]

        # generate dir target
        box_num = gt_bboxes_3d.shape[0]
        if self.with_rot:
            (dir_class_target,
             dir_res_target) = self.angle2class(gt_bboxes_3d[..., 6])
        else:
            dir_class_target = gt_labels_3d.new_zeros(box_num)
            dir_res_target = gt_bboxes_3d.new_zeros(box_num)

        return (center_target, size_class_target, size_res_target,
                dir_class_target, dir_res_target)

    def decode(self, bbox_out):
        """Decode predicted parts to bbox3d.

        Args:
            bbox_out (dict): Predictions from model, should contain keys below.

                - center: predicted bottom center of bboxes.
                - dir_class: predicted bbox direction class.
                - dir_res: predicted bbox direction residual.
                - size_class: predicted bbox size class.
                - size_res: predicted bbox size residual.

        Returns:
            Tensor: Decoded bbox3d with shape (batch, n, 7).
        """
        center = bbox_out['center']
        batch_size, num_proposal = center.shape[:2]

        # decode heading angle
        if self.with_rot:
            dir_class = torch.argmax(bbox_out['dir_class'], -1)
            dir_res = torch.gather(bbox_out['dir_res'], 2,
                                   dir_class.unsqueeze(-1))
            dir_res.squeeze_(2)
            dir_angle = self.class2angle(dir_class, dir_res).reshape(
                batch_size, num_proposal, 1)
        else:
            dir_angle = center.new_zeros(batch_size, num_proposal, 1)

        # decode bbox size
        size_class = torch.argmax(bbox_out['size_class'], -1, keepdim=True)
        size_res = torch.gather(bbox_out['size_res'], 2,
                                size_class.unsqueeze(-1).repeat(1, 1, 1, 3))
        mean_sizes = center.new_tensor(self.mean_sizes)
        size_base = torch.index_select(mean_sizes, 0, size_class.reshape(-1))
        bbox_size = size_base.reshape(batch_size, num_proposal,
                                      -1) + size_res.squeeze(2)

        bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1)
        return bbox3d

    def split_pred(self, preds, base_xyz):
        """Split predicted features to specific parts.

        Args:
            preds (Tensor): Predicted features to split.
            base_xyz (Tensor): Coordinates of points.

        Returns:
            dict: Split results.
        """
        results = {}
        start, end = 0, 0
        preds_trans = preds.transpose(2, 1)

        # decode objectness score
        end += 2
        results['obj_scores'] = preds_trans[..., start:end]
        start = end

        # decode center
        end += 3
        # (batch_size, num_proposal, 3)
        results['center'] = base_xyz + preds_trans[..., start:end]
        start = end

        # decode direction
        end += self.num_dir_bins
        results['dir_class'] = preds_trans[..., start:end]
        start = end

        end += self.num_dir_bins
        dir_res_norm = preds_trans[..., start:end]
        start = end

        results['dir_res_norm'] = dir_res_norm
        results['dir_res'] = dir_res_norm * (np.pi / self.num_dir_bins)

        # decode size
        end += self.num_sizes
        results['size_class'] = preds_trans[..., start:end]
        start = end

        end += self.num_sizes * 3
        size_res_norm = preds_trans[..., start:end]
        batch_size, num_proposal = preds_trans.shape[:2]
        size_res_norm = size_res_norm.view(
            [batch_size, num_proposal, self.num_sizes, 3])
        start = end

        results['size_res_norm'] = size_res_norm
        mean_sizes = preds.new_tensor(self.mean_sizes)
        results['size_res'] = (
            size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0))

        # decode semantic score
        results['sem_scores'] = preds_trans[..., start:]

        return results

    def angle2class(self, angle):
        """Convert continuous angle to a discrete class and a residual.

        Convert continuous angle to a discrete class and a small
        regression number from class center angle to current angle.

        Args:
            angle (Tensor): Angle is from 0-2pi (or -pi~pi), class center at
                0, 1*(2pi/N), 2*(2pi/N) ... (N-1)*(2pi/N).

        Returns:
            tuple: Encoded discrete class and residual.
        """
        angle = angle % (2 * np.pi)
        angle_per_class = 2 * np.pi / float(self.num_dir_bins)
        shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi)
        angle_cls = shifted_angle // angle_per_class
        angle_res = shifted_angle - (
            angle_cls * angle_per_class + angle_per_class / 2)
        return angle_cls.long(), angle_res

    def class2angle(self, angle_cls, angle_res, limit_period=True):
        """Inverse function to angle2class.

        Args:
            angle_cls (Tensor): Angle class to decode.
            angle_res (Tensor): Angle residual to decode.
            limit_period (bool): Whether to limit angle to [-pi, pi].

        Returns:
            Tensor: Angle decoded from angle_cls and angle_res.
        """
        angle_per_class = 2 * np.pi / float(self.num_dir_bins)
        angle_center = angle_cls.float() * angle_per_class
        angle = angle_center + angle_res
        if limit_period:
            angle[angle > np.pi] -= 2 * np.pi
        return angle
@@ -84,3 +84,87 @@ def bbox3d2result(bboxes, scores, labels):
     """
     return dict(
         boxes_3d=bboxes.cpu(), scores_3d=scores.cpu(), labels_3d=labels.cpu())
def upright_depth_to_lidar_torch(points=None,
                                 bboxes=None,
                                 to_bottom_center=False):
    """Convert points and boxes in upright depth coordinate to lidar.

    Args:
        points (None | Tensor): Points in upright depth coordinate.
        bboxes (None | Tensor): Bboxes in upright depth coordinate.
        to_bottom_center (bool): Whether to convert bboxes to bottom center.

    Returns:
        tuple: Points and bboxes in lidar coordinate.
    """
    if points is not None:
        points_lidar = points.clone()
        # swap the x and y axes and flip the new y axis
        points_lidar = points_lidar[..., [1, 0, 2]]
        points_lidar[..., 1] *= -1
    else:
        points_lidar = None

    if bboxes is not None:
        bboxes_lidar = bboxes.clone()
        # swap x/y and l/w to follow the axis swap above
        bboxes_lidar = bboxes_lidar[..., [1, 0, 2, 4, 3, 5, 6]]
        bboxes_lidar[..., 1] *= -1
        if to_bottom_center:
            bboxes_lidar[..., 2] -= 0.5 * bboxes_lidar[..., 5]
    else:
        bboxes_lidar = None

    return points_lidar, bboxes_lidar


def box3d_to_corner3d_upright_depth(boxes3d):
    """Convert box3d to corner3d in upright depth coordinate.

    Args:
        boxes3d (Tensor): Boxes with shape [n, 7] in upright depth coordinate.

    Returns:
        Tensor: Corners with shape [n, 8, 3] in upright depth coordinate.
    """
    boxes_num = boxes3d.shape[0]
    ry = boxes3d[:, 6:7]
    l, w, h = boxes3d[:, 3:4], boxes3d[:, 4:5], boxes3d[:, 5:6]
    zeros = boxes3d.new_zeros((boxes_num, 1))
    ones = boxes3d.new_ones((boxes_num, 1))

    x_corners = torch.cat(
        [-l / 2., l / 2., l / 2., -l / 2., -l / 2., l / 2., l / 2., -l / 2.],
        dim=1)  # (N, 8)
    y_corners = torch.cat(
        [w / 2., w / 2., -w / 2., -w / 2., w / 2., w / 2., -w / 2., -w / 2.],
        dim=1)  # (N, 8)
    z_corners = torch.cat(
        [h / 2., h / 2., h / 2., h / 2., -h / 2., -h / 2., -h / 2., -h / 2.],
        dim=1)  # (N, 8)
    temp_corners = torch.cat(
        (x_corners.unsqueeze(dim=2), y_corners.unsqueeze(dim=2),
         z_corners.unsqueeze(dim=2)),
        dim=2)  # (N, 8, 3)

    # per-box rotation matrix around the z axis
    cosa, sina = torch.cos(-ry), torch.sin(-ry)
    row_1 = torch.cat([cosa, -sina, zeros], dim=1)  # (N, 3)
    row_2 = torch.cat([sina, cosa, zeros], dim=1)  # (N, 3)
    row_3 = torch.cat([zeros, zeros, ones], dim=1)  # (N, 3)
    R = torch.cat((row_1.unsqueeze(dim=1), row_2.unsqueeze(dim=1),
                   row_3.unsqueeze(dim=1)),
                  dim=1)  # (N, 3, 3)

    rotated_corners = torch.matmul(temp_corners, R)  # (N, 8, 3)
    x_corners = rotated_corners[:, :, 0]
    y_corners = rotated_corners[:, :, 1]
    z_corners = rotated_corners[:, :, 2]

    # translate the rotated corners to the box location
    x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2]
    x = x_loc.view(-1, 1) + x_corners.view(-1, 8)
    y = y_loc.view(-1, 1) + y_corners.view(-1, 8)
    z = z_loc.view(-1, 1) + z_corners.view(-1, 8)
    corners3d = torch.cat(
        (x.view(-1, 8, 1), y.view(-1, 8, 1), z.view(-1, 8, 1)), dim=2)

    return corners3d
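
A minimal usage sketch (an assumption, not part of the commit): corners of an axis-aligned 2 x 1 x 1 box centered at the origin with zero heading.

import torch

boxes3d = torch.tensor([[0., 0., 0., 2., 1., 1., 0.]])  # (x, y, z, l, w, h, ry)
corners = box3d_to_corner3d_upright_depth(boxes3d)
print(corners.shape)  # torch.Size([1, 8, 3])
# Each corner is (+-l/2, +-w/2, +-h/2); e.g. corners[0, 0] is (-1.0, 0.5, 0.5).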
 from mmdet.core.post_processing import (merge_aug_bboxes, merge_aug_masks,
                                         merge_aug_proposals, merge_aug_scores,
                                         multiclass_nms)
-from .box3d_nms import box3d_multiclass_nms
+from .box3d_nms import aligned_3d_nms, box3d_multiclass_nms

 __all__ = [
     'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes',
-    'merge_aug_scores', 'merge_aug_masks', 'box3d_multiclass_nms'
+    'merge_aug_scores', 'merge_aug_masks', 'box3d_multiclass_nms',
+    'aligned_3d_nms'
 ]
@@ -64,3 +64,52 @@ def box3d_multiclass_nms(mlvl_bboxes,
         labels = mlvl_scores.new_zeros((0, mlvl_scores.size(-1)))
         dir_scores = mlvl_scores.new_zeros((0, ))
     return bboxes, scores, labels, dir_scores
def aligned_3d_nms(boxes, scores, classes, thresh):
    """3d NMS for aligned boxes.

    Args:
        boxes (Tensor): Aligned boxes with shape [n, 6].
        scores (Tensor): Scores of each box.
        classes (Tensor): Class of each box.
        thresh (float): IoU threshold for NMS.

    Returns:
        Tensor: Indices of selected boxes.
    """
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    z1 = boxes[:, 2]
    x2 = boxes[:, 3]
    y2 = boxes[:, 4]
    z2 = boxes[:, 5]
    area = (x2 - x1) * (y2 - y1) * (z2 - z1)
    zero = boxes.new_zeros(1, )

    score_sorted = torch.argsort(scores)
    pick = []
    while (score_sorted.shape[0] != 0):
        last = score_sorted.shape[0]
        i = score_sorted[-1]
        pick.append(i)

        # intersection of the highest-scoring box with the remaining boxes
        xx1 = torch.max(x1[i], x1[score_sorted[:last - 1]])
        yy1 = torch.max(y1[i], y1[score_sorted[:last - 1]])
        zz1 = torch.max(z1[i], z1[score_sorted[:last - 1]])
        xx2 = torch.min(x2[i], x2[score_sorted[:last - 1]])
        yy2 = torch.min(y2[i], y2[score_sorted[:last - 1]])
        zz2 = torch.min(z2[i], z2[score_sorted[:last - 1]])
        classes1 = classes[i]
        classes2 = classes[score_sorted[:last - 1]]
        inter_l = torch.max(zero, xx2 - xx1)
        inter_w = torch.max(zero, yy2 - yy1)
        inter_h = torch.max(zero, zz2 - zz1)

        inter = inter_l * inter_w * inter_h
        iou = inter / (area[i] + area[score_sorted[:last - 1]] - inter)
        # only boxes of the same class suppress each other
        iou = iou * (classes1 == classes2).float()
        score_sorted = score_sorted[torch.nonzero(iou <= thresh).flatten()]

    indices = boxes.new_tensor(pick, dtype=torch.long)
    return indices
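
A minimal usage sketch (an assumption, not part of the commit): two heavily overlapping axis-aligned boxes of the same class, where the lower-scoring one is suppressed at an IoU threshold of 0.25.

import torch

boxes = torch.tensor([[0., 0., 0., 2., 2., 2.],
                      [0.1, 0.1, 0.1, 2., 2., 2.]])  # (x1, y1, z1, x2, y2, z2)
scores = torch.tensor([0.9, 0.6])
classes = torch.tensor([1, 1])
keep = aligned_3d_nms(boxes, scores, classes, thresh=0.25)
print(keep)  # tensor([0]) -- only the higher-scoring box survives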
@@ -19,12 +19,14 @@ class Custom3DDataset(Dataset):
                  pipeline=None,
                  classes=None,
                  modality=None,
+                 filter_empty_gt=True,
                  test_mode=False):
         super().__init__()
         self.data_root = data_root
         self.ann_file = ann_file
         self.test_mode = test_mode
         self.modality = modality
+        self.filter_empty_gt = filter_empty_gt
         self.CLASSES = self.get_classes(classes)
         self.data_infos = self.load_annotations(self.ann_file)
@@ -52,7 +54,7 @@ class Custom3DDataset(Dataset):
         if not self.test_mode:
             annos = self.get_ann_info(index)
             input_dict['ann_info'] = annos
-            if len(annos['gt_bboxes_3d']) == 0:
+            if self.filter_empty_gt and len(annos['gt_bboxes_3d']) == 0:
                 return None
         return input_dict
@@ -67,7 +69,8 @@ class Custom3DDataset(Dataset):
             return None
         self.pre_pipeline(input_dict)
         example = self.pipeline(input_dict)
-        if example is None or len(example['gt_bboxes_3d']._data) == 0:
+        if self.filter_empty_gt and (example is None or len(
+                example['gt_bboxes_3d']._data) == 0):
             return None
         return example
@@ -124,10 +127,13 @@ class Custom3DDataset(Dataset):
             results (list[dict]): List of results.
             metric (str | list[str]): Metrics to be evaluated.
             iou_thr (list[float]): AP IoU thresholds.
         """
         from mmdet3d.core.evaluation import indoor_eval
         assert isinstance(
             results, list), f'Expect results to be list, got {type(results)}.'
+        assert len(results) > 0, 'Expect length of results > 0.'
+        assert len(results) == len(self.data_infos)
         assert isinstance(
             results[0], dict
         ), f'Expect elements in results to be dict, got {type(results[0])}.'
...
@@ -7,6 +7,7 @@ from .indoor_loading import (LoadAnnotations3D, LoadPointsFromFile,
                              NormalizePointsColor)
 from .indoor_sample import IndoorPointSample
 from .loading import LoadMultiViewImageFromFiles
+from .point_seg_class_mapping import PointSegClassMapping
 from .train_aug import (GlobalRotScale, ObjectNoise, ObjectRangeFilter,
                         ObjectSample, PointShuffle, PointsRangeFilter,
                         RandomFlip3D)
@@ -18,5 +19,5 @@ __all__ = [
     'DefaultFormatBundle', 'DefaultFormatBundle3D', 'DataBaseSampler',
     'IndoorGlobalRotScale', 'IndoorPointsColorJitter', 'IndoorFlipData',
     'MMDataBaseSampler', 'NormalizePointsColor', 'LoadAnnotations3D',
-    'IndoorPointSample'
+    'IndoorPointSample', 'PointSegClassMapping'
 ]
@@ -224,7 +224,7 @@ class IndoorGlobalRotScale(object):
         results['scale_ratio'] = scale_ratio
         results['points'] = points
-        results['gt_bboxes_3d'] = gt_bboxes_3d
+        results['gt_bboxes_3d'] = gt_bboxes_3d.astype(np.float32)
         return results

     def __repr__(self):
...
@@ -189,7 +189,8 @@ class LoadAnnotations3D(LoadAnnotations):
         self.file_client = mmcv.FileClient(**self.file_client_args)
         try:
             mask_bytes = self.file_client.get(pts_semantic_mask_path)
-            pts_semantic_mask = np.frombuffer(mask_bytes, dtype=np.int)
+            # add .copy() to fix read-only bug
+            pts_semantic_mask = np.frombuffer(mask_bytes, dtype=np.int).copy()
         except ConnectionError:
             mmcv.check_file_exist(pts_semantic_mask_path)
             pts_semantic_mask = np.fromfile(
...
from mmdet.datasets.builder import PIPELINES


@PIPELINES.register_module()
class PointSegClassMapping(object):
    """Map original semantic class ids to valid category ids.

    Valid classes are mapped to 0 ... len(valid_cat_ids) - 1 and
    all other classes to len(valid_cat_ids).

    Args:
        valid_cat_ids (tuple[int]): A tuple of valid category ids.
    """

    def __init__(self, valid_cat_ids):
        self.valid_cat_ids = valid_cat_ids

    def __call__(self, results):
        assert 'pts_semantic_mask' in results
        pts_semantic_mask = results['pts_semantic_mask']
        neg_cls = len(self.valid_cat_ids)

        for i in range(pts_semantic_mask.shape[0]):
            if pts_semantic_mask[i] in self.valid_cat_ids:
                converted_id = self.valid_cat_ids.index(pts_semantic_mask[i])
                pts_semantic_mask[i] = converted_id
            else:
                pts_semantic_mask[i] = neg_cls

        results['pts_semantic_mask'] = pts_semantic_mask
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += '(valid_cat_ids={})'.format(self.valid_cat_ids)
        return repr_str
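
A minimal usage sketch (an assumption, not part of the commit): with two valid ids, labels 3 and 5 map to 0 and 1, and anything else maps to 2.

import numpy as np

mapping = PointSegClassMapping(valid_cat_ids=(3, 5))
results = {'pts_semantic_mask': np.array([3, 5, 7, 3])}
out = mapping(results)
print(out['pts_semantic_mask'])  # [0 1 2 0]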
@@ -20,9 +20,10 @@ class ScanNetDataset(Custom3DDataset):
                  pipeline=None,
                  classes=None,
                  modality=None,
+                 filter_empty_gt=True,
                  test_mode=False):
         super().__init__(data_root, ann_file, pipeline, classes, modality,
-                         test_mode)
+                         filter_empty_gt, test_mode)

     def get_ann_info(self, index):
         # Use index to get the annos, thus the evalhook could also use this api
...
@@ -16,9 +16,10 @@ class SUNRGBDDataset(Custom3DDataset):
                  pipeline=None,
                  classes=None,
                  modality=None,
+                 filter_empty_gt=True,
                  test_mode=False):
         super().__init__(data_root, ann_file, pipeline, classes, modality,
-                         test_mode)
+                         filter_empty_gt, test_mode)

     def get_ann_info(self, index):
         # Use index to get the annos, thus the evalhook could also use this api
...
@@ -8,6 +8,7 @@ from .detectors import *  # noqa: F401,F403
 from .fusion_layers import *  # noqa: F401,F403
 from .losses import *  # noqa: F401,F403
 from .middle_encoders import *  # noqa: F401,F403
+from .model_utils import *  # noqa: F401,F403
 from .necks import *  # noqa: F401,F403
 from .registry import FUSION_LAYERS, MIDDLE_ENCODERS, VOXEL_ENCODERS
 from .roi_heads import *  # noqa: F401,F403
...
 from .anchor3d_head import Anchor3DHead
 from .parta2_rpn_head import PartA2RPNHead
+from .vote_head import VoteHead

-__all__ = ['Anchor3DHead', 'PartA2RPNHead']
+__all__ = ['Anchor3DHead', 'PartA2RPNHead', 'VoteHead']
(One collapsed diff is omitted here.)
@@ -4,10 +4,11 @@ from .mvx_faster_rcnn import (DynamicMVXFasterRCNN, DynamicMVXFasterRCNNV2,
 from .mvx_single_stage import MVXSingleStageDetector
 from .mvx_two_stage import MVXTwoStageDetector
 from .parta2 import PartA2
+from .votenet import VoteNet
 from .voxelnet import DynamicVoxelNet, VoxelNet

 __all__ = [
     'BaseDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXSingleStageDetector',
     'MVXTwoStageDetector', 'DynamicMVXFasterRCNN', 'DynamicMVXFasterRCNNV2',
-    'DynamicMVXFasterRCNNV3', 'PartA2'
+    'DynamicMVXFasterRCNNV3', 'PartA2', 'VoteNet'
 ]
import torch

from mmdet3d.core import bbox3d2result
from mmdet.models import DETECTORS, SingleStageDetector


@DETECTORS.register_module()
class VoteNet(SingleStageDetector):
    """VoteNet model.

    https://arxiv.org/pdf/1904.09664.pdf
    """

    def __init__(self,
                 backbone,
                 bbox_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(VoteNet, self).__init__(
            backbone=backbone,
            bbox_head=bbox_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            pretrained=pretrained)

    def extract_feat(self, points):
        x = self.backbone(points)
        if self.with_neck:
            x = self.neck(x)
        return x

    def forward_train(self,
                      points,
                      img_meta,
                      gt_bboxes_3d,
                      gt_labels_3d,
                      pts_semantic_mask=None,
                      pts_instance_mask=None,
                      gt_bboxes_ignore=None):
        """Forward of training.

        Args:
            points (list[Tensor]): Points of each batch.
            img_meta (list): Image metas.
            gt_bboxes_3d (list[Tensor]): gt bboxes of each batch.
            gt_labels_3d (list[Tensor]): gt class labels of each batch.
            pts_semantic_mask (None | list[Tensor]): point-wise semantic
                label of each batch.
            pts_instance_mask (None | list[Tensor]): point-wise instance
                label of each batch.
            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
                boxes to ignore.

        Returns:
            dict: Losses.
        """
        points_cat = torch.stack(points)  # tmp

        x = self.extract_feat(points_cat)
        bbox_preds = self.bbox_head(x, self.train_cfg.sample_mod)
        loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask,
                       pts_instance_mask, img_meta)
        losses = self.bbox_head.loss(
            bbox_preds, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        return losses

    def forward_test(self, **kwargs):
        return self.simple_test(**kwargs)

    def forward(self, return_loss=True, **kwargs):
        if return_loss:
            return self.forward_train(**kwargs)
        else:
            return self.forward_test(**kwargs)

    def simple_test(self,
                    points,
                    img_meta,
                    gt_bboxes_3d=None,
                    gt_labels_3d=None,
                    pts_semantic_mask=None,
                    pts_instance_mask=None,
                    rescale=False):
        """Forward of testing.

        Args:
            points (list[Tensor]): Points of each sample.
            img_meta (list): Image metas.
            gt_bboxes_3d (list[Tensor]): gt bboxes of each sample.
            gt_labels_3d (list[Tensor]): gt class labels of each sample.
            pts_semantic_mask (None | list[Tensor]): point-wise semantic
                label of each sample.
            pts_instance_mask (None | list[Tensor]): point-wise instance
                label of each sample.
            rescale (bool): Whether to rescale results.

        Returns:
            list: Predicted 3d boxes.
        """
        points_cat = torch.stack(points)  # tmp

        x = self.extract_feat(points_cat)
        bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod)
        bbox_list = self.bbox_head.get_bboxes(
            points_cat, bbox_preds, img_meta, rescale=rescale)
        bbox_results = [
            bbox3d2result(bboxes, scores, labels)
            for bboxes, scores, labels in bbox_list
        ]
        return bbox_results[0]
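
A minimal inference sketch (an assumption, not part of the commit), reusing the `detector` built from the ScanNet config sketch earlier; the PointNet++ ops are CUDA-only, hence the .cuda() calls:

import torch

detector = detector.cuda().eval()
points = [torch.rand(40000, 4).cuda()]  # x, y, z, height for one sample
with torch.no_grad():
    result = detector(return_loss=False, points=points, img_meta=[dict()])
# result is a dict with 'boxes_3d', 'scores_3d' and 'labels_3d'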