"eigen-master/blas/complex_single.cpp" did not exist on "e7df86554156b36846008d8ddbcc4d8521a16554"
Commit f27d308f authored by yinchimaoliang

merge master

parents c66ae813 27ebcfac
@@ -3,6 +3,6 @@ line_length = 79
multi_line_output = 0
known_standard_library = setuptools
known_first_party = mmdet,mmdet3d
known_third_party = cv2,mmcv,numba,numpy,nuscenes,plyfile,pycocotools,pyquaternion,pytest,scipy,shapely,six,skimage,torch,torchvision
known_third_party = cv2,mmcv,numba,numpy,nuscenes,plyfile,pycocotools,pyquaternion,pytest,scipy,shapely,six,skimage,terminaltables,torch,torchvision
no_lines_before = STDLIB,LOCALFOLDER
default_section = THIRDPARTY
@@ -15,6 +15,7 @@ repos:
rev: v0.30.0
hooks:
- id: yapf
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.5.0
hooks:
......
# model settings
voxel_size = [0.25, 0.25, 8]
point_cloud_range = [-50, -50, -5, 50, 50, 3]
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
model = dict(
type='MVXFasterRCNNV2',
pts_voxel_layer=dict(
max_num_points=64, # max_points_per_voxel
point_cloud_range=point_cloud_range, # velodyne coordinates, x, y, z
voxel_size=voxel_size,
max_voxels=(30000, 40000),  # (training, testing) max_voxels
),
pts_voxel_encoder=dict(
type='HardVFE',
num_input_features=4,
num_filters=[64, 64],
with_distance=False,
voxel_size=voxel_size,
with_cluster_center=True,
with_voxel_center=True,
point_cloud_range=point_cloud_range,
norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
pts_middle_encoder=dict(
type='PointPillarsScatter',
in_channels=64,
output_shape=[400, 400], # checked from PointCloud3D
),
pts_backbone=dict(
type='SECOND',
in_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
layer_nums=[3, 5, 5],
layer_strides=[2, 2, 2],
out_channels=[64, 128, 256],
),
pts_neck=dict(
type='SECONDFPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128],
),
pts_bbox_head=dict(
type='Anchor3DHead',
num_classes=10,
in_channels=384,
feat_channels=384,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[
[-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795],
[-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365],
[-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504],
[-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111],
[-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072],
[-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986],
[-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965],
],
sizes=[
[1.95017717, 4.60718145, 1.72270761], # car
[2.4560939, 6.73778078, 2.73004906], # truck
[2.87427237, 12.01320693, 3.81509561], # trailer
[0.60058911, 1.68452161, 1.27192197], # bicycle
[0.66344886, 0.7256437, 1.75748069], # pedestrian
[0.39694519, 0.40359262, 1.06232151], # traffic_cone
[2.49008838, 0.48578221, 0.98297065], # barrier
],
custom_values=[0, 0],
rotations=[0, 1.57],
reshape_out=True),
assigner_per_size=False,
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)))
# model training and testing settings
train_cfg = dict(
pts=dict(
assigner=dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
pos_weight=-1,
debug=False))
test_cfg = dict(
pts=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=1000,
nms_thr=0.2,
score_thr=0.05,
min_bbox_size=0,
max_num=500
# soft-nms is also supported for rcnn testing
# e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05)
))
# dataset settings
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
input_modality = dict(
use_lidar=True,
use_depth=False,
use_lidar_intensity=True,
use_camera=False,
)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'nuscenes_dbinfos_train.pkl',
rate=1.0,
object_rot_range=[0.0, 0.0],
prepare=dict(),
classes=class_names,
sample_groups=dict(
bus=4,
trailer=4,
truck=4,
))
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/nuscenes/': 's3://nuscenes/nuscenes/',
'data/nuscenes/': 's3://nuscenes/nuscenes/'
}))
train_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScale',
rot_uniform_noise=[-0.3925, 0.3925],
scaling_uniform_noise=[0.95, 1.05],
trans_normal_noise=[0, 0, 0]),
dict(type='RandomFlip3D', flip_ratio=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='RandomFlip3D', flip_ratio=0),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points']),
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train.pkl',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# optimizer
optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
# max_norm=10 is better for SECOND
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 1000,
step=[20, 23])
momentum_config = None
checkpoint_config = dict(interval=1)
# yapf:disable
evaluation = dict(interval=24)
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 24
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d'
load_from = None
resume_from = None
workflow = [('train', 1)]
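
As a quick check, a config file like the one above can be loaded and inspected with mmcv's Config before training. A minimal sketch; the file path is hypothetical:

from mmcv import Config

cfg = Config.fromfile(
    'configs/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py')  # hypothetical path
print(cfg.model.type)            # -> 'MVXFasterRCNNV2'
print(cfg.data.samples_per_gpu)  # -> 4
cfg.data.samples_per_gpu = 2     # config values can be overridden in place
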
# model settings
voxel_size = [0.05, 0.05, 0.1]
point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z
model = dict(
type='VoxelNet',
voxel_layer=dict(
max_num_points=5, # max_points_per_voxel
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(16000, 40000),  # (training, testing) max_voxels
),
voxel_encoder=dict(
type='VoxelFeatureExtractorV3',
num_input_features=4,
num_filters=[4],
with_distance=False),
middle_encoder=dict(
type='SparseEncoder',
in_channels=4,
sparse_shape=[41, 1600, 1408],
order=('conv', 'norm', 'act')),
backbone=dict(
type='SECOND',
in_channels=256,
layer_nums=[5, 5],
layer_strides=[1, 2],
out_channels=[128, 256],
),
neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
upsample_strides=[1, 2],
out_channels=[256, 256],
),
bbox_head=dict(
type='Anchor3DHead',
num_classes=1,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
sizes=[[1.6, 3.9, 1.56]],
rotations=[0, 1.57],
reshape_out=True),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2),
),
)
# model training and testing settings
train_cfg = dict(
assigner=dict(
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
allowed_border=0,
pos_weight=-1,
debug=False)
test_cfg = dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.3,
min_bbox_size=0,
nms_pre=100,
max_num=50)
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Car']
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict(
use_lidar=False,
use_lidar_reduced=True,
use_depth=False,
use_lidar_intensity=True,
use_camera=False,
)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
object_rot_range=[0.0, 0.0],
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(Car=5),
),
classes=class_names,
sample_groups=dict(Car=15),
)
file_client_args = dict(
backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
train_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
file_client_args=file_client_args),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='ObjectNoise',
num_try=100,
loc_noise_std=[1.0, 1.0, 0.5],
global_rot_range=[0.0, 0.0],
rot_uniform_noise=[-0.78539816, 0.78539816]),
dict(type='RandomFlip3D', flip_ratio=0.5),
dict(
type='GlobalRotScale',
rot_uniform_noise=[-0.78539816, 0.78539816],
scaling_uniform_noise=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points']),
]
data = dict(
samples_per_gpu=6,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# optimizer
lr = 0.0018 # max learning rate
optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
lr_config = dict(
policy='cyclic',
target_ratio=(10, 1e-4),
cyclic_times=1,
step_ratio_up=0.4,
)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.85 / 0.95, 1),
cyclic_times=1,
step_ratio_up=0.4,
)
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 80
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/sec_secfpn_80e'
load_from = None
resume_from = None
workflow = [('train', 1)]
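
For intuition, the cyclic schedule above with target_ratio=(10, 1e-4) and step_ratio_up=0.4 ramps the learning rate from lr up to 10 * lr over the first 40% of iterations, then anneals it down towards 1e-4 * lr. A rough standalone sketch of that shape (cosine annealing in both phases is an assumption, not mmcv's exact implementation):

import numpy as np

def cyclic_lr(base_lr, total_iters, target_ratio=(10, 1e-4), step_ratio_up=0.4):
    # one cycle: base_lr -> 10 * base_lr in the up phase, then down to 1e-4 * base_lr
    up = int(total_iters * step_ratio_up)
    hi, lo = target_ratio[0] * base_lr, target_ratio[1] * base_lr
    lrs = np.empty(total_iters)
    for t in range(total_iters):
        if t < up:
            w = (1 + np.cos(np.pi * t / up)) / 2  # weight 1 -> 0
            lrs[t] = hi + w * (base_lr - hi)
        else:
            w = (1 + np.cos(np.pi * (t - up) / (total_iters - up))) / 2
            lrs[t] = lo + w * (hi - lo)
    return lrs

schedule = cyclic_lr(0.0018, total_iters=1000)
# starts near 0.0018, peaks at 0.018, decays towards 1.8e-7
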
@@ -255,13 +255,11 @@ optimizer = dict(type='AdamW', lr=0.003, betas=(0.95, 0.99), weight_decay=0.01)
# max_norm=10 is better for SECOND
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='cosine',
policy='CosineAnealing',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 10,
target_lr=1e-5,
as_ratio=True,
)
min_lr_ratio=1e-5)
momentum_config = None
checkpoint_config = dict(interval=1)
# yapf:disable
......
@@ -207,13 +207,11 @@ optimizer = dict(
weight_decay=0.001)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
lr_config = dict(
policy='cosine',
policy='CosineAnealing',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 10,
target_lr=1e-5,
as_ratio=True,
)
min_lr_ratio=1e-5)
momentum_config = None
checkpoint_config = dict(interval=1)
# yapf:disable
......
# model settings
model = dict(
type='VoteNet',
backbone=dict(
type='PointNet2SASSG',
in_channels=4,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
(128, 128, 256)),
fp_channels=((256, 256), (256, 256)),
norm_cfg=dict(type='BN2d'),
pool_mod='max'),
bbox_head=dict(
type='VoteHead',
num_classes=18,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=18,
num_dir_bins=1,
with_rot=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]),
vote_moudule_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True),
feat_channels=(128, 128),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
center_loss=dict(
type='ChamferDistance',
mode='l2',
reduction='sum',
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)))
# model training and testing settings
train_cfg = dict(pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')
test_cfg = dict(
sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True)
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin')
train_pipeline = [
dict(
type='LoadPointsFromFile',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=True,
with_seg_3d=True),
dict(
type='PointSegClassMapping',
valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
36, 39)),
dict(type='IndoorPointSample', num_points=40000),
dict(type='IndoorFlipData', flip_ratio_yz=0.5, flip_ratio_xz=0.5),
dict(
type='IndoorGlobalRotScale',
shift_height=True,
rot_range=[-1 / 36, 1 / 36],
scale_range=None),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask'
])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='IndoorPointSample', num_points=40000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points'])
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=5,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_train.pkl',
pipeline=train_pipeline,
filter_empty_gt=False,
classes=class_names)),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True))
# optimizer
lr = 0.008 # max learning rate
optimizer = dict(type='Adam', lr=lr)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[24, 32])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=10,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 36
dist_params = dict(backend='nccl')
log_level = 'INFO'
find_unused_parameters = True
work_dir = './work_dirs/votenet_scannet'
load_from = None
resume_from = None
workflow = [('train', 1)]
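
For reference, the PointSegClassMapping step in the pipeline above remaps the raw ScanNet category ids in valid_cat_ids to contiguous training labels. A minimal sketch of the assumed behaviour, sending unmapped ids to an ignore index:

import numpy as np

valid_cat_ids = (3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39)
lut = np.full(max(valid_cat_ids) + 1, len(valid_cat_ids))  # ignore index for unmapped ids
lut[np.array(valid_cat_ids)] = np.arange(len(valid_cat_ids))

raw_seg = np.array([3, 1, 39, 24])  # raw per-point ScanNet category ids
print(lut[raw_seg])                 # -> [ 0 18 17 12]
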
# model settings
model = dict(
type='VoteNet',
backbone=dict(
type='PointNet2SASSG',
in_channels=4,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
(128, 128, 256)),
fp_channels=((256, 256), (256, 256)),
norm_cfg=dict(type='BN2d'),
pool_mod='max'),
bbox_head=dict(
type='VoteHead',
num_classes=10,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=10,
num_dir_bins=12,
with_rot=True,
mean_sizes=[[2.114256, 1.620300, 0.927272],
[0.791118, 1.279516, 0.718182],
[0.923508, 1.867419, 0.845495],
[0.591958, 0.552978, 0.827272],
[0.699104, 0.454178, 0.75625],
[0.69519, 1.346299, 0.736364],
[0.528526, 1.002642, 1.172878],
[0.500618, 0.632163, 0.683424],
[0.404671, 1.071108, 1.688889],
[0.76584, 1.398258, 0.472728]]),
vote_moudule_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True),
feat_channels=(128, 128),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
center_loss=dict(
type='ChamferDistance',
mode='l2',
reduction='sum',
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)))
# model training and testing settings
train_cfg = dict(pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')
test_cfg = dict(
sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True)
# dataset settings
dataset_type = 'SUNRGBDDataset'
data_root = 'data/sunrgbd/'
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub')
train_pipeline = [
dict(
type='LoadPointsFromFile',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='LoadAnnotations3D'),
dict(type='IndoorFlipData', flip_ratio_yz=0.5),
dict(
type='IndoorGlobalRotScale',
shift_height=True,
rot_range=[-1 / 6, 1 / 6],
scale_range=[0.85, 1.15]),
dict(type='IndoorPointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='IndoorPointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points'])
]
data = dict(
samples_per_gpu=16,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=5,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
filter_empty_gt=False)),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True))
# optimizer
lr = 0.008 # max learning rate
optimizer = dict(type='Adam', lr=lr)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[24, 32])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=30,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 36
dist_params = dict(backend='nccl')
log_level = 'INFO'
find_unused_parameters = True
work_dir = './work_dirs/votenet_sunrgbd'
load_from = None
resume_from = None
workflow = [('train', 1)]
@@ -8,7 +8,9 @@ from .samplers import (BaseSampler, CombinedSampler,
InstanceBalancedPosSampler, IoUBalancedNegSampler,
PseudoSampler, RandomSampler, SamplingResult)
from .structures import Box3DMode, CameraInstance3DBoxes, LiDARInstance3DBoxes
from .transforms import bbox3d2result, bbox3d2roi, boxes3d_to_bev_torch_lidar
from .transforms import (bbox3d2result, bbox3d2roi,
box3d_to_corner3d_upright_depth,
boxes3d_to_bev_torch_lidar)
from .assign_sampling import ( # isort:skip, avoid recursive imports
build_bbox_coder, # temporally settings
@@ -22,5 +24,6 @@ __all__ = [
'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'boxes3d_to_bev_torch_lidar',
'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d',
'bbox_overlaps_3d', 'Box3DMode', 'LiDARInstance3DBoxes',
'CameraInstance3DBoxes', 'bbox3d2roi', 'bbox3d2result'
'CameraInstance3DBoxes', 'bbox3d2roi', 'bbox3d2result',
'box3d_to_corner3d_upright_depth'
]
from mmdet.core.bbox import build_bbox_coder
from .delta_xyzwhlr_bbox_coder import DeltaXYZWLHRBBoxCoder
from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder
__all__ = ['build_bbox_coder', 'DeltaXYZWLHRBBoxCoder']
__all__ = [
'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'PartialBinBasedBBoxCoder'
]
import numpy as np
import torch

from mmdet.core.bbox import BaseBBoxCoder
from mmdet.core.bbox.builder import BBOX_CODERS


@BBOX_CODERS.register_module()
class PartialBinBasedBBoxCoder(BaseBBoxCoder):
    """Partial bin based bbox coder.

    Args:
        num_dir_bins (int): Number of bins to encode direction angle.
        num_sizes (int): Number of size clusters.
        mean_sizes (list[list[int]]): Mean size of bboxes in each class.
        with_rot (bool): Whether the bbox is with rotation.
    """

    def __init__(self, num_dir_bins, num_sizes, mean_sizes, with_rot=True):
        super(PartialBinBasedBBoxCoder, self).__init__()
        assert len(mean_sizes) == num_sizes
        self.num_dir_bins = num_dir_bins
        self.num_sizes = num_sizes
        self.mean_sizes = mean_sizes
        self.with_rot = with_rot

    def encode(self, gt_bboxes_3d, gt_labels_3d):
        """Encode ground truth to prediction targets.

        Args:
            gt_bboxes_3d (Tensor): 3d gt bboxes with shape (n, 7).
            gt_labels_3d (Tensor): Gt classes.

        Returns:
            tuple: Targets of center, size and direction.
        """
        # generate center target
        center_target = gt_bboxes_3d[..., 0:3]

        # generate bbox size target
        size_class_target = gt_labels_3d
        size_res_target = gt_bboxes_3d[..., 3:6] - gt_bboxes_3d.new_tensor(
            self.mean_sizes)[size_class_target]

        # generate dir target
        box_num = gt_bboxes_3d.shape[0]
        if self.with_rot:
            (dir_class_target,
             dir_res_target) = self.angle2class(gt_bboxes_3d[..., 6])
        else:
            dir_class_target = gt_labels_3d.new_zeros(box_num)
            dir_res_target = gt_bboxes_3d.new_zeros(box_num)

        return (center_target, size_class_target, size_res_target,
                dir_class_target, dir_res_target)

    def decode(self, bbox_out):
        """Decode predicted parts to bbox3d.

        Args:
            bbox_out (dict): Predictions from model, should contain keys below
                - center: predicted bottom center of bboxes.
                - dir_class: predicted bbox direction class.
                - dir_res: predicted bbox direction residual.
                - size_class: predicted bbox size class.
                - size_res: predicted bbox size residual.

        Returns:
            Tensor: Decoded bbox3d with shape (batch, n, 7).
        """
        center = bbox_out['center']
        batch_size, num_proposal = center.shape[:2]

        # decode heading angle
        if self.with_rot:
            dir_class = torch.argmax(bbox_out['dir_class'], -1)
            dir_res = torch.gather(bbox_out['dir_res'], 2,
                                   dir_class.unsqueeze(-1))
            dir_res.squeeze_(2)
            dir_angle = self.class2angle(dir_class, dir_res).reshape(
                batch_size, num_proposal, 1)
        else:
            dir_angle = center.new_zeros(batch_size, num_proposal, 1)

        # decode bbox size
        size_class = torch.argmax(bbox_out['size_class'], -1, keepdim=True)
        size_res = torch.gather(bbox_out['size_res'], 2,
                                size_class.unsqueeze(-1).repeat(1, 1, 1, 3))
        mean_sizes = center.new_tensor(self.mean_sizes)
        size_base = torch.index_select(mean_sizes, 0, size_class.reshape(-1))
        bbox_size = size_base.reshape(batch_size, num_proposal,
                                      -1) + size_res.squeeze(2)

        bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1)
        return bbox3d

    def split_pred(self, preds, base_xyz):
        """Split predicted features to specific parts.

        Args:
            preds (Tensor): Predicted features to split.
            base_xyz (Tensor): Coordinates of points.

        Returns:
            dict: Split results.
        """
        results = {}
        start, end = 0, 0
        preds_trans = preds.transpose(2, 1)

        # decode objectness score
        end += 2
        results['obj_scores'] = preds_trans[..., start:end]
        start = end

        # decode center
        end += 3
        # (batch_size, num_proposal, 3)
        results['center'] = base_xyz + preds_trans[..., start:end]
        start = end

        # decode direction
        end += self.num_dir_bins
        results['dir_class'] = preds_trans[..., start:end]
        start = end

        end += self.num_dir_bins
        dir_res_norm = preds_trans[..., start:end]
        start = end

        results['dir_res_norm'] = dir_res_norm
        results['dir_res'] = dir_res_norm * (np.pi / self.num_dir_bins)

        # decode size
        end += self.num_sizes
        results['size_class'] = preds_trans[..., start:end]
        start = end

        end += self.num_sizes * 3
        size_res_norm = preds_trans[..., start:end]
        batch_size, num_proposal = preds_trans.shape[:2]
        size_res_norm = size_res_norm.view(
            [batch_size, num_proposal, self.num_sizes, 3])
        start = end

        results['size_res_norm'] = size_res_norm
        mean_sizes = preds.new_tensor(self.mean_sizes)
        results['size_res'] = (
            size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0))

        # decode semantic score
        results['sem_scores'] = preds_trans[..., start:]

        return results

    def angle2class(self, angle):
        """Convert continuous angle to a discrete class and a residual.

        Converts a continuous angle into a bin class plus a small regression
        offset from that bin's center angle to the input angle.

        Args:
            angle (Tensor): Angle is from 0-2pi (or -pi~pi), class center at
                0, 1*(2pi/N), 2*(2pi/N) ... (N-1)*(2pi/N).

        Returns:
            tuple: Encoded discrete class and residual.
        """
        angle = angle % (2 * np.pi)
        angle_per_class = 2 * np.pi / float(self.num_dir_bins)
        shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi)
        angle_cls = shifted_angle // angle_per_class
        angle_res = shifted_angle - (
            angle_cls * angle_per_class + angle_per_class / 2)
        return angle_cls.long(), angle_res

    def class2angle(self, angle_cls, angle_res, limit_period=True):
        """Inverse function to angle2class.

        Args:
            angle_cls (Tensor): Angle class to decode.
            angle_res (Tensor): Angle residual to decode.
            limit_period (bool): Whether to limit angle to [-pi, pi].

        Returns:
            Tensor: Angle decoded from angle_cls and angle_res.
        """
        angle_per_class = 2 * np.pi / float(self.num_dir_bins)
        angle_center = angle_cls.float() * angle_per_class
        angle = angle_center + angle_res
        if limit_period:
            angle[angle > np.pi] -= 2 * np.pi
        return angle
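
A minimal sanity check of the angle binning above (assuming the class is used as defined): angle2class followed by class2angle recovers the input angle up to the 2 * pi period.

import numpy as np
import torch

coder = PartialBinBasedBBoxCoder(
    num_dir_bins=12, num_sizes=1, mean_sizes=[[1.0, 1.0, 1.0]], with_rot=True)
angle = torch.tensor([0.1, 1.6, 3.0])
angle_cls, angle_res = coder.angle2class(angle)
recovered = coder.class2angle(angle_cls, angle_res)
# class2angle limits the result to [-pi, pi], so compare modulo 2 * pi
assert torch.allclose(recovered % (2 * np.pi), angle % (2 * np.pi), atol=1e-6)
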
@@ -84,3 +84,87 @@ def bbox3d2result(bboxes, scores, labels):
"""
return dict(
boxes_3d=bboxes.cpu(), scores_3d=scores.cpu(), labels_3d=labels.cpu())
def upright_depth_to_lidar_torch(points=None,
                                 bboxes=None,
                                 to_bottom_center=False):
    """Convert points and boxes in upright depth coordinate to lidar.

    Args:
        points (None | Tensor): Points in upright depth coordinate.
        bboxes (None | Tensor): Bboxes in upright depth coordinate.
        to_bottom_center (bool): Convert bboxes to bottom center.

    Returns:
        tuple: Points and bboxes in lidar coordinate.
    """
    if points is not None:
        points_lidar = points.clone()
        points_lidar = points_lidar[..., [1, 0, 2]]
        points_lidar[..., 1] *= -1
    else:
        points_lidar = None

    if bboxes is not None:
        bboxes_lidar = bboxes.clone()
        bboxes_lidar = bboxes_lidar[..., [1, 0, 2, 4, 3, 5, 6]]
        bboxes_lidar[..., 1] *= -1
        if to_bottom_center:
            bboxes_lidar[..., 2] -= 0.5 * bboxes_lidar[..., 5]
    else:
        bboxes_lidar = None

    return points_lidar, bboxes_lidar
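

# Hypothetical usage of the conversion above: depth (x, y, z) maps to
# lidar (y, -x, z), and to_bottom_center shifts the box center down by
# half the box height.
#
#     pts = torch.tensor([[1.0, 2.0, 0.5]])
#     box = torch.tensor([[1.0, 2.0, 0.5, 4.0, 2.0, 1.5, 0.0]])
#     pts_l, box_l = upright_depth_to_lidar_torch(pts, box, to_bottom_center=True)
#     # pts_l -> [[2.0, -1.0, 0.5]]
#     # box_l -> [[2.0, -1.0, -0.25, 2.0, 4.0, 1.5, 0.0]]
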
def box3d_to_corner3d_upright_depth(boxes3d):
    """Convert box3d to corner3d in upright depth coordinate.

    Args:
        boxes3d (Tensor): Boxes with shape [n, 7] in upright depth coordinate.

    Returns:
        Tensor: Corners with shape [n, 8, 3] in upright depth coordinate.
    """
    boxes_num = boxes3d.shape[0]
    ry = boxes3d[:, 6:7]
    l, w, h = boxes3d[:, 3:4], boxes3d[:, 4:5], boxes3d[:, 5:6]
    zeros = boxes3d.new_zeros((boxes_num, 1))
    ones = boxes3d.new_ones((boxes_num, 1))

    x_corners = torch.cat(
        [-l / 2., l / 2., l / 2., -l / 2., -l / 2., l / 2., l / 2., -l / 2.],
        dim=1)  # (N, 8)
    y_corners = torch.cat(
        [w / 2., w / 2., -w / 2., -w / 2., w / 2., w / 2., -w / 2., -w / 2.],
        dim=1)  # (N, 8)
    z_corners = torch.cat(
        [h / 2., h / 2., h / 2., h / 2., -h / 2., -h / 2., -h / 2., -h / 2.],
        dim=1)  # (N, 8)
    temp_corners = torch.cat(
        (x_corners.unsqueeze(dim=2), y_corners.unsqueeze(dim=2),
         z_corners.unsqueeze(dim=2)),
        dim=2)  # (N, 8, 3)

    # rotation around the z axis by -ry
    cosa, sina = torch.cos(-ry), torch.sin(-ry)
    row_1 = torch.cat([cosa, -sina, zeros], dim=1)  # (N, 3)
    row_2 = torch.cat([sina, cosa, zeros], dim=1)  # (N, 3)
    row_3 = torch.cat([zeros, zeros, ones], dim=1)  # (N, 3)
    R = torch.cat((row_1.unsqueeze(dim=1), row_2.unsqueeze(dim=1),
                   row_3.unsqueeze(dim=1)),
                  dim=1)  # (N, 3, 3)

    rotated_corners = torch.matmul(temp_corners, R)  # (N, 8, 3)
    x_corners = rotated_corners[:, :, 0]
    y_corners = rotated_corners[:, :, 1]
    z_corners = rotated_corners[:, :, 2]

    x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2]
    x = x_loc.view(-1, 1) + x_corners.view(-1, 8)
    y = y_loc.view(-1, 1) + y_corners.view(-1, 8)
    z = z_loc.view(-1, 1) + z_corners.view(-1, 8)
    corners3d = torch.cat(
        (x.view(-1, 8, 1), y.view(-1, 8, 1), z.view(-1, 8, 1)), dim=2)

    return corners3d
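
Hypothetical usage of box3d_to_corner3d_upright_depth: a unit cube centered at the origin with zero yaw yields the eight corners at +/-0.5 along each axis.

import torch

box = torch.tensor([[0., 0., 0., 1., 1., 1., 0.]])
corners = box3d_to_corner3d_upright_depth(box)
assert corners.shape == (1, 8, 3)
assert torch.allclose(corners.abs(), torch.full((1, 8, 3), 0.5))
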
import numpy as np
import torch
from mmcv.utils import print_log
from terminaltables import AsciiTable
from mmdet3d.core.bbox.iou_calculators.iou3d_calculator import bbox_overlaps_3d
@@ -263,14 +265,14 @@ def eval_map_recall(det_infos, gt_infos, ovthresh=None):
recall[iou_idx][label], precision[iou_idx][label], ap[iou_idx][
label] = ret_values[i][iou_idx]
else:
recall[iou_idx][label] = [0]
precision[iou_idx][label] = [0]
ap[iou_idx][label] = [0]
recall[iou_idx][label] = np.zeros(1)
precision[iou_idx][label] = np.zeros(1)
ap[iou_idx][label] = np.zeros(1)
return recall, precision, ap
def indoor_eval(gt_annos, dt_annos, metric, label2cat):
def indoor_eval(gt_annos, dt_annos, metric, label2cat, logger=None):
"""Scannet Evaluation.
Evaluate the result of the detection.
@@ -280,6 +282,8 @@ def indoor_eval(gt_annos, dt_annos, metric, label2cat):
dt_annos (list[dict]): Detection annotations.
metric (list[float]): AP IoU thresholds.
label2cat (dict): {label: cat}.
logger (logging.Logger | str | None): The way to print the mAP
summary. See `mmdet.utils.print_log()` for details. Default: None.
Returns:
dict: Dict of results.
@@ -301,20 +305,41 @@ def indoor_eval(gt_annos, dt_annos, metric, label2cat):
boxes_3d=np.array([], dtype=np.float32),
labels_3d=np.array([], dtype=np.int64)))
result_str = str()
result_str += 'mAP'
rec, prec, ap = eval_map_recall(dt_annos, gt_infos, metric)
ret_dict = {}
ret_dict = dict()
header = ['classes']
table_columns = [[label2cat[label]
for label in ap[0].keys()] + ['Overall']]
for i, iou_thresh in enumerate(metric):
header.append(f'AP_{iou_thresh:.2f}')
header.append(f'AR_{iou_thresh:.2f}')
rec_list = []
for label in ap[i].keys():
ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float(
ap[i][label][0])
ret_dict[f'mAP_{iou_thresh:.2f}'] = float(
np.mean(list(ap[i].values())))
table_columns.append(list(map(float, list(ap[i].values()))))
table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']]
table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]
for label in rec[i].keys():
ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float(
rec[i][label][-1])
rec_list.append(rec[i][label][-1])
ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list))
table_columns.append(list(map(float, rec_list)))
table_columns[-1] += [ret_dict[f'mAR_{iou_thresh:.2f}']]
table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]
table_data = [header]
table_rows = list(zip(*table_columns))
table_data += table_rows
table = AsciiTable(table_data)
table.inner_footing_row_border = True
print_log('\n' + table.table, logger=logger)
return ret_dict
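
The summary that indoor_eval prints is assembled with terminaltables; a minimal sketch of the same table layout, with made-up numbers:

from terminaltables import AsciiTable

header = ['classes', 'AP_0.25', 'AR_0.25']
rows = [['chair', '0.8123', '0.9001'],
        ['table', '0.6450', '0.7512'],
        ['Overall', '0.7287', '0.8257']]
table = AsciiTable([header] + rows)
table.inner_footing_row_border = True  # rule above the Overall row
print(table.table)
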
from mmdet.core.post_processing import (merge_aug_bboxes, merge_aug_masks,
merge_aug_proposals, merge_aug_scores,
multiclass_nms)
from .box3d_nms import box3d_multiclass_nms
from .box3d_nms import aligned_3d_nms, box3d_multiclass_nms
__all__ = [
'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes',
'merge_aug_scores', 'merge_aug_masks', 'box3d_multiclass_nms'
'merge_aug_scores', 'merge_aug_masks', 'box3d_multiclass_nms',
'aligned_3d_nms'
]
@@ -64,3 +64,52 @@ def box3d_multiclass_nms(mlvl_bboxes,
labels = mlvl_scores.new_zeros((0, mlvl_scores.size(-1)))
dir_scores = mlvl_scores.new_zeros((0, ))
return bboxes, scores, labels, dir_scores
def aligned_3d_nms(boxes, scores, classes, thresh):
    """3D NMS for axis-aligned boxes.

    Args:
        boxes (Tensor): Aligned boxes with shape [n, 6] as
            (x1, y1, z1, x2, y2, z2).
        scores (Tensor): Scores of each box.
        classes (Tensor): Class of each box.
        thresh (float): IoU threshold for nms.

    Returns:
        Tensor: Indices of selected boxes.
    """
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    z1 = boxes[:, 2]
    x2 = boxes[:, 3]
    y2 = boxes[:, 4]
    z2 = boxes[:, 5]
    area = (x2 - x1) * (y2 - y1) * (z2 - z1)
    zero = boxes.new_zeros(1)

    score_sorted = torch.argsort(scores)
    pick = []
    while score_sorted.shape[0] != 0:
        last = score_sorted.shape[0]
        i = score_sorted[-1]
        pick.append(i)

        xx1 = torch.max(x1[i], x1[score_sorted[:last - 1]])
        yy1 = torch.max(y1[i], y1[score_sorted[:last - 1]])
        zz1 = torch.max(z1[i], z1[score_sorted[:last - 1]])
        xx2 = torch.min(x2[i], x2[score_sorted[:last - 1]])
        yy2 = torch.min(y2[i], y2[score_sorted[:last - 1]])
        zz2 = torch.min(z2[i], z2[score_sorted[:last - 1]])
        classes1 = classes[i]
        classes2 = classes[score_sorted[:last - 1]]
        inter_l = torch.max(zero, xx2 - xx1)
        inter_w = torch.max(zero, yy2 - yy1)
        inter_h = torch.max(zero, zz2 - zz1)

        inter = inter_l * inter_w * inter_h
        iou = inter / (area[i] + area[score_sorted[:last - 1]] - inter)

        # suppression only applies within the same class
        iou = iou * (classes1 == classes2).float()
        score_sorted = score_sorted[torch.nonzero(iou <= thresh).flatten()]

    indices = boxes.new_tensor(pick, dtype=torch.long)
    return indices
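
Hypothetical usage of aligned_3d_nms: two heavily overlapping boxes of the same class, of which only the higher-scoring one survives, plus one distant box of another class.

import torch

boxes = torch.tensor([[0.0, 0.0, 0.0, 2.0, 2.0, 2.0],
                      [0.1, 0.1, 0.1, 2.1, 2.1, 2.1],
                      [5.0, 5.0, 5.0, 6.0, 6.0, 6.0]])
scores = torch.tensor([0.9, 0.8, 0.7])
classes = torch.tensor([0, 0, 1])
keep = aligned_3d_nms(boxes, scores, classes, thresh=0.25)
# keep -> tensor([0, 2]): box 1 is suppressed by box 0 (IoU ~0.75, same class)
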
@@ -8,8 +8,8 @@ from .loader import DistributedGroupSampler, GroupSampler, build_dataloader
from .nuscenes_dataset import NuScenesDataset
from .pipelines import (GlobalRotScale, IndoorFlipData, IndoorGlobalRotScale,
IndoorPointSample, IndoorPointsColorJitter,
IndoorPointsColorNormalize, LoadAnnotations3D,
LoadPointsFromFile, ObjectNoise, ObjectRangeFilter,
LoadAnnotations3D, LoadPointsFromFile,
NormalizePointsColor, ObjectNoise, ObjectRangeFilter,
ObjectSample, PointShuffle, PointsRangeFilter,
RandomFlip3D)
from .scannet_dataset import ScanNetDataset
@@ -21,7 +21,7 @@ __all__ = [
'CocoDataset', 'Kitti2DDataset', 'NuScenesDataset', 'ObjectSample',
'RandomFlip3D', 'ObjectNoise', 'GlobalRotScale', 'PointShuffle',
'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
'LoadPointsFromFile', 'IndoorPointsColorNormalize', 'IndoorPointSample',
'LoadPointsFromFile', 'NormalizePointsColor', 'IndoorPointSample',
'LoadAnnotations3D', 'IndoorPointsColorJitter', 'IndoorGlobalRotScale',
'IndoorFlipData', 'SUNRGBDDataset', 'ScanNetDataset', 'Custom3DDataset'
]
@@ -3,7 +3,6 @@ import tempfile
import mmcv
import numpy as np
from mmcv.utils import print_log
from torch.utils.data import Dataset
from mmdet.datasets import DATASETS
@@ -19,12 +18,14 @@ class Custom3DDataset(Dataset):
pipeline=None,
classes=None,
modality=None,
filter_empty_gt=True,
test_mode=False):
super().__init__()
self.data_root = data_root
self.ann_file = ann_file
self.test_mode = test_mode
self.modality = modality
self.filter_empty_gt = filter_empty_gt
self.CLASSES = self.get_classes(classes)
self.data_infos = self.load_annotations(self.ann_file)
@@ -52,7 +53,7 @@ class Custom3DDataset(Dataset):
if not self.test_mode:
annos = self.get_ann_info(index)
input_dict['ann_info'] = annos
if len(annos['gt_bboxes_3d']) == 0:
if self.filter_empty_gt and len(annos['gt_bboxes_3d']) == 0:
return None
return input_dict
@@ -67,7 +68,8 @@ class Custom3DDataset(Dataset):
return None
self.pre_pipeline(input_dict)
example = self.pipeline(input_dict)
if example is None or len(example['gt_bboxes_3d']._data) == 0:
if self.filter_empty_gt and (example is None or len(
example['gt_bboxes_3d']._data) == 0):
return None
return example
@@ -124,23 +126,20 @@ class Custom3DDataset(Dataset):
results (list[dict]): List of results.
metric (str | list[str]): Metrics to be evaluated.
iou_thr (list[float]): AP IoU thresholds.
"""
from mmdet3d.core.evaluation import indoor_eval
assert isinstance(
results, list), f'Expect results to be list, got {type(results)}.'
assert len(results) > 0, 'Expect length of results > 0.'
assert len(results) == len(self.data_infos)
assert isinstance(
results[0], dict
), f'Expect elements in results to be dict, got {type(results[0])}.'
gt_annos = [info['annos'] for info in self.data_infos]
label2cat = {i: cat_id for i, cat_id in enumerate(self.CLASSES)}
ret_dict = indoor_eval(gt_annos, results, iou_thr, label2cat)
result_str = str()
for key, val in ret_dict.items():
result_str += f'{key} : {val} \n'
mAP_25, mAP_50 = ret_dict['mAP_0.25'], ret_dict['mAP_0.50']
result_str += f'mAP(0.25): {mAP_25} mAP(0.50): {mAP_50}'
print_log('\n' + result_str, logger=logger)
ret_dict = indoor_eval(
gt_annos, results, iou_thr, label2cat, logger=logger)
return ret_dict
......
@@ -3,10 +3,11 @@ from .dbsampler import DataBaseSampler, MMDataBaseSampler
from .formating import DefaultFormatBundle, DefaultFormatBundle3D
from .indoor_augment import (IndoorFlipData, IndoorGlobalRotScale,
IndoorPointsColorJitter)
from .indoor_loading import (IndoorPointsColorNormalize, LoadAnnotations3D,
LoadPointsFromFile)
from .indoor_loading import (LoadAnnotations3D, LoadPointsFromFile,
NormalizePointsColor)
from .indoor_sample import IndoorPointSample
from .loading import LoadMultiViewImageFromFiles
from .point_seg_class_mapping import PointSegClassMapping
from .train_aug import (GlobalRotScale, ObjectNoise, ObjectRangeFilter,
ObjectSample, PointShuffle, PointsRangeFilter,
RandomFlip3D)
@@ -17,6 +18,6 @@ __all__ = [
'Compose', 'LoadMultiViewImageFromFiles', 'LoadPointsFromFile',
'DefaultFormatBundle', 'DefaultFormatBundle3D', 'DataBaseSampler',
'IndoorGlobalRotScale', 'IndoorPointsColorJitter', 'IndoorFlipData',
'MMDataBaseSampler', 'IndoorPointsColorNormalize', 'LoadAnnotations3D',
'IndoorPointSample'
'MMDataBaseSampler', 'NormalizePointsColor', 'LoadAnnotations3D',
'IndoorPointSample', 'PointSegClassMapping'
]
@@ -224,7 +224,7 @@ class IndoorGlobalRotScale(object):
results['scale_ratio'] = scale_ratio
results['points'] = points
results['gt_bboxes_3d'] = gt_bboxes_3d
results['gt_bboxes_3d'] = gt_bboxes_3d.astype(np.float32)
return results
def __repr__(self):
......
@@ -6,8 +6,8 @@ from mmdet.datasets.pipelines import LoadAnnotations
@PIPELINES.register_module()
class IndoorPointsColorNormalize(object):
"""Indoor points color normalize
class NormalizePointsColor(object):
"""Normalize color of points
Normalize color of the points.
@@ -45,9 +45,16 @@ class LoadPointsFromFile(object):
use_dim (list[int]): Which dimensions of the points to be used.
Default: [0, 1, 2]. For KITTI dataset, set use_dim=4
or use_dim=[0, 1, 2, 3] to use the intensity dimension
file_client_args (dict): Config dict of file clients, refer to
https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
for more details.
"""
def __init__(self, load_dim=6, use_dim=[0, 1, 2], shift_height=False):
def __init__(self,
load_dim=6,
use_dim=[0, 1, 2],
shift_height=False,
file_client_args=dict(backend='disk')):
self.shift_height = shift_height
if isinstance(use_dim, int):
use_dim = list(range(use_dim))
@@ -56,8 +63,16 @@ class LoadPointsFromFile(object):
self.load_dim = load_dim
self.use_dim = use_dim
self.file_client_args = file_client_args.copy()
self.file_client = None
def _load_points(self, pts_filename):
if self.file_client is None:
self.file_client = mmcv.FileClient(**self.file_client_args)
try:
pts_bytes = self.file_client.get(pts_filename)
points = np.frombuffer(pts_bytes, dtype=np.float32)
except ConnectionError:
mmcv.check_file_exist(pts_filename)
if pts_filename.endswith('.npy'):
points = np.load(pts_filename)
@@ -113,6 +128,9 @@ class LoadAnnotations3D(LoadAnnotations):
Defaults to False.
poly2mask (bool, optional): Whether to convert polygon annotations
to bitmasks. Defaults to True.
file_client_args (dict): Config dict of file clients, refer to
https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
for more details.
"""
def __init__(self,
@@ -124,8 +142,15 @@
with_label=False,
with_mask=False,
with_seg=False,
poly2mask=True):
super().__init__(with_bbox, with_label, with_mask, with_seg, poly2mask)
poly2mask=True,
file_client_args=dict(backend='disk')):
super().__init__(
with_bbox,
with_label,
with_mask,
with_seg,
poly2mask,
file_client_args=file_client_args)
self.with_bbox_3d = with_bbox_3d
self.with_label_3d = with_label_3d
self.with_mask_3d = with_mask_3d
@@ -142,16 +167,35 @@
def _load_masks_3d(self, results):
pts_instance_mask_path = results['ann_info']['pts_instance_mask_path']
if self.file_client is None:
self.file_client = mmcv.FileClient(**self.file_client_args)
try:
mask_bytes = self.file_client.get(pts_instance_mask_path)
pts_instance_mask = np.frombuffer(mask_bytes, dtype=np.int)
except ConnectionError:
mmcv.check_file_exist(pts_instance_mask_path)
pts_instance_mask = np.fromfile(pts_instance_mask_path, dtype=np.long)
pts_instance_mask = np.fromfile(
pts_instance_mask_path, dtype=np.long)
results['pts_instance_mask'] = pts_instance_mask
results['pts_mask_fields'].append(results['pts_instance_mask'])
return results
def _load_semantic_seg_3d(self, results):
pts_semantic_mask_path = results['ann_info']['pts_semantic_mask_path']
if self.file_client is None:
self.file_client = mmcv.FileClient(**self.file_client_args)
try:
mask_bytes = self.file_client.get(pts_semantic_mask_path)
# add .copy() to fix read-only bug
pts_semantic_mask = np.frombuffer(mask_bytes, dtype=np.int).copy()
except ConnectionError:
mmcv.check_file_exist(pts_semantic_mask_path)
pts_semantic_mask = np.fromfile(pts_semantic_mask_path, dtype=np.long)
pts_semantic_mask = np.fromfile(
pts_semantic_mask_path, dtype=np.long)
results['pts_semantic_mask'] = pts_semantic_mask
results['pts_seg_fields'].append(results['pts_semantic_mask'])
return results
......
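
The loading pattern in this diff (try the configured backend first, fall back to the local file on ConnectionError) condenses to something like the sketch below. load_mask is a hypothetical helper, not part of the diff, and the dtype is pinned to int64 where the diff uses np.int / np.long:

import mmcv
import numpy as np

def load_mask(path, file_client_args=dict(backend='disk')):
    file_client = mmcv.FileClient(**file_client_args)
    try:
        mask_bytes = file_client.get(path)
        # copy() because np.frombuffer returns a read-only view
        return np.frombuffer(mask_bytes, dtype=np.int64).copy()
    except ConnectionError:
        mmcv.check_file_exist(path)
        return np.fromfile(path, dtype=np.int64)
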