Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
mmdetection3d
Commits
885a225b
Commit
885a225b
authored
May 15, 2020
by
wuyuefeng
Committed by
zhangwenwei
May 15, 2020
Browse files
Feature parta2 roi
parent
535344de
Changes
20
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1599 additions
and
93 deletions
+1599
-93
configs/kitti/hv_PartA2_secfpn_4x8_cyclic_80e_kitti-3d-3class.py
.../kitti/hv_PartA2_secfpn_4x8_cyclic_80e_kitti-3d-3class.py
+139
-37
configs/kitti/hv_PartA2_secfpn_4x8_cyclic_80e_kitti-3d-car.py
...igs/kitti/hv_PartA2_secfpn_4x8_cyclic_80e_kitti-3d-car.py
+278
-0
mmdet3d/core/bbox/__init__.py
mmdet3d/core/bbox/__init__.py
+2
-2
mmdet3d/core/bbox/box_np_ops.py
mmdet3d/core/bbox/box_np_ops.py
+66
-0
mmdet3d/core/bbox/box_torch_ops.py
mmdet3d/core/bbox/box_torch_ops.py
+67
-0
mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py
mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py
+5
-0
mmdet3d/core/bbox/transforms.py
mmdet3d/core/bbox/transforms.py
+22
-0
mmdet3d/core/evaluation/kitti_utils/eval.py
mmdet3d/core/evaluation/kitti_utils/eval.py
+0
-1
mmdet3d/models/anchor_heads/parta2_rpn_head.py
mmdet3d/models/anchor_heads/parta2_rpn_head.py
+8
-12
mmdet3d/models/anchor_heads/second_head.py
mmdet3d/models/anchor_heads/second_head.py
+3
-3
mmdet3d/models/detectors/parta2.py
mmdet3d/models/detectors/parta2.py
+38
-16
mmdet3d/models/roi_heads/__init__.py
mmdet3d/models/roi_heads/__init__.py
+8
-1
mmdet3d/models/roi_heads/base_3droi_head.py
mmdet3d/models/roi_heads/base_3droi_head.py
+80
-0
mmdet3d/models/roi_heads/bbox_heads/__init__.py
mmdet3d/models/roi_heads/bbox_heads/__init__.py
+2
-1
mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py
mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py
+593
-0
mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py
...3d/models/roi_heads/mask_heads/pointwise_semantic_head.py
+8
-9
mmdet3d/models/roi_heads/part_aggregation_roi_head.py
mmdet3d/models/roi_heads/part_aggregation_roi_head.py
+227
-0
tests/test_config.py
tests/test_config.py
+45
-1
tests/test_heads.py
tests/test_heads.py
+7
-7
tests/test_semantic_heads.py
tests/test_semantic_heads.py
+1
-3
No files found.
configs/kitti/hv_PartA2_secfpn_4x8_c
osine
_80e_kitti-3d-3class.py
→
configs/kitti/hv_PartA2_secfpn_4x8_c
yclic
_80e_kitti-3d-3class.py
View file @
885a225b
...
...
@@ -10,13 +10,16 @@ model = dict(
voxel_size
=
voxel_size
,
max_voxels
=
(
16000
,
40000
)
# (training, testing) max_coxels
),
voxel_encoder
=
dict
(
type
=
'VoxelFeatureExtractorV3'
),
voxel_encoder
=
dict
(
type
=
'VoxelFeatureExtractorV3'
,
num_input_features
=
4
,
num_filters
=
[
4
],
with_distance
=
False
),
middle_encoder
=
dict
(
type
=
'SparseUNet'
,
in_channels
=
4
,
output_shape
=
[
41
,
1600
,
1408
],
pre_act
=
False
,
),
pre_act
=
False
),
backbone
=
dict
(
type
=
'SECOND'
,
in_channels
=
256
,
...
...
@@ -56,8 +59,65 @@ model = dict(
loss_weight
=
1.0
),
loss_bbox
=
dict
(
type
=
'SmoothL1Loss'
,
beta
=
1.0
/
9.0
,
loss_weight
=
2.0
),
loss_dir
=
dict
(
type
=
'CrossEntropyLoss'
,
use_sigmoid
=
False
,
loss_weight
=
0.2
),
))
type
=
'CrossEntropyLoss'
,
use_sigmoid
=
False
,
loss_weight
=
0.2
)),
roi_head
=
dict
(
type
=
'PartAggregationROIHead'
,
num_classes
=
3
,
semantic_head
=
dict
(
type
=
'PointwiseSemanticHead'
,
in_channels
=
16
,
extra_width
=
0.2
,
seg_score_thr
=
0.3
,
num_classes
=
3
,
loss_seg
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
reduction
=
'sum'
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
1.0
),
loss_part
=
dict
(
type
=
'CrossEntropyLoss'
,
use_sigmoid
=
True
,
loss_weight
=
1.0
)),
seg_roi_extractor
=
dict
(
type
=
'Single3DRoIAwareExtractor'
,
roi_layer
=
dict
(
type
=
'RoIAwarePool3d'
,
out_size
=
14
,
max_pts_per_voxel
=
128
,
mode
=
'max'
)),
part_roi_extractor
=
dict
(
type
=
'Single3DRoIAwareExtractor'
,
roi_layer
=
dict
(
type
=
'RoIAwarePool3d'
,
out_size
=
14
,
max_pts_per_voxel
=
128
,
mode
=
'avg'
)),
bbox_head
=
dict
(
type
=
'PartA2BboxHead'
,
num_classes
=
3
,
seg_in_channels
=
16
,
part_in_channels
=
4
,
seg_conv_channels
=
[
64
,
64
],
part_conv_channels
=
[
64
,
64
],
merge_conv_channels
=
[
128
,
128
],
down_conv_channels
=
[
128
,
256
],
bbox_coder
=
dict
(
type
=
'DeltaXYZWLHRBBoxCoder'
),
shared_fc_channels
=
[
256
,
512
,
512
,
512
],
cls_channels
=
[
256
,
256
],
reg_channels
=
[
256
,
256
],
dropout_ratio
=
0.1
,
roi_feat_size
=
14
,
with_corner_loss
=
True
,
loss_bbox
=
dict
(
type
=
'SmoothL1Loss'
,
beta
=
1.0
/
9.0
,
reduction
=
'sum'
,
loss_weight
=
1.0
),
loss_cls
=
dict
(
type
=
'CrossEntropyLoss'
,
use_sigmoid
=
True
,
reduction
=
'sum'
,
loss_weight
=
1.0
))))
# model training and testing settings
train_cfg
=
dict
(
rpn
=
dict
(
...
...
@@ -82,7 +142,7 @@ train_cfg = dict(
pos_iou_thr
=
0.6
,
neg_iou_thr
=
0.45
,
min_pos_iou
=
0.45
,
ignore_iof_thr
=-
1
)
,
ignore_iof_thr
=-
1
)
],
allowed_border
=
0
,
pos_weight
=-
1
,
...
...
@@ -93,24 +153,61 @@ train_cfg = dict(
nms_thr
=
0.8
,
score_thr
=
0
,
use_rotate_nms
=
False
),
)
rcnn
=
dict
(
assigner
=
[
dict
(
# for Pedestrian
type
=
'MaxIoUAssigner'
,
iou_calculator
=
dict
(
type
=
'BboxOverlaps3D'
,
coordinate
=
'lidar'
),
pos_iou_thr
=
0.55
,
neg_iou_thr
=
0.55
,
min_pos_iou
=
0.55
,
ignore_iof_thr
=-
1
),
dict
(
# for Cyclist
type
=
'MaxIoUAssigner'
,
iou_calculator
=
dict
(
type
=
'BboxOverlaps3D'
,
coordinate
=
'lidar'
),
pos_iou_thr
=
0.55
,
neg_iou_thr
=
0.55
,
min_pos_iou
=
0.55
,
ignore_iof_thr
=-
1
),
dict
(
# for Car
type
=
'MaxIoUAssigner'
,
iou_calculator
=
dict
(
type
=
'BboxOverlaps3D'
,
coordinate
=
'lidar'
),
pos_iou_thr
=
0.55
,
neg_iou_thr
=
0.55
,
min_pos_iou
=
0.55
,
ignore_iof_thr
=-
1
)
],
sampler
=
dict
(
type
=
'IoUNegPiecewiseSampler'
,
num
=
128
,
pos_fraction
=
0.55
,
neg_piece_fractions
=
[
0.8
,
0.2
],
neg_iou_piece_thrs
=
[
0.55
,
0.1
],
neg_pos_ub
=-
1
,
add_gt_as_proposals
=
False
,
return_iou
=
True
),
cls_pos_thr
=
0.75
,
cls_neg_thr
=
0.25
))
test_cfg
=
dict
(
rpn
=
dict
(
nms_pre
=
1024
,
max_per_img
=
100
,
use_rotate_nms
=
True
,
nms_across_levels
=
False
,
nms_post
=
100
,
nms_thr
=
0.7
,
score_thr
=
0
))
score_thr
=
0
,
use_rotate_nms
=
True
),
rcnn
=
dict
(
use_rotate_nms
=
True
,
use_raw_score
=
True
,
nms_thr
=
0.01
,
score_thr
=
0.1
))
# dataset settings
dataset_type
=
'KittiDataset'
data_root
=
'data/kitti/'
class_names
=
[
'Pedestrian'
,
'Cyclist'
,
'Car'
]
img_norm_cfg
=
dict
(
mean
=
[
123.675
,
116.28
,
103.53
],
std
=
[
58.395
,
57.12
,
57.375
],
to_rgb
=
True
)
input_modality
=
dict
(
use_lidar
=
True
,
use_depth
=
False
,
use_lidar_intensity
=
True
,
use_camera
=
True
)
use_lidar
=
False
,
use_lidar_reduced
=
True
,
use_depth
=
False
,
use_lidar_intensity
=
True
,
use_camera
=
False
)
db_sampler
=
dict
(
root_path
=
data_root
,
info_path
=
data_root
+
'kitti_dbinfos_train.pkl'
,
...
...
@@ -119,28 +216,34 @@ db_sampler = dict(
object_rot_range
=
[
0.0
,
0.0
],
prepare
=
dict
(
filter_by_difficulty
=
[
-
1
],
filter_by_min_points
=
dict
(
Car
=
5
,
Pedestrian
=
10
,
Cyclist
=
10
)),
sample_groups
=
dict
(
Car
=
12
,
Pedestrian
=
6
,
Cyclist
=
6
),
)
filter_by_min_points
=
dict
(
Car
=
5
,
Pedestrian
=
10
,
Cyclist
=
10
,
)),
sample_groups
=
dict
(
Car
=
12
,
Pedestrian
=
6
,
Cyclist
=
6
,
))
train_pipeline
=
[
dict
(
type
=
'ObjectSample'
,
db_sampler
=
db_sampler
),
dict
(
type
=
'ObjectNoise'
,
num_try
=
100
,
loc_noise_std
=
[
0
,
0
,
0
],
loc_noise_std
=
[
1.
0
,
1.
0
,
0
.5
],
global_rot_range
=
[
0.0
,
0.0
],
rot_uniform_noise
=
[
-
0.
39269908
,
0.39269908
]),
rot_uniform_noise
=
[
-
0.
78539816
,
0.78539816
]),
dict
(
type
=
'RandomFlip3D'
,
flip_ratio
=
0.5
),
dict
(
type
=
'GlobalRotScale'
,
rot_uniform_noise
=
[
-
0.78539816
,
0.78539816
],
scaling_uniform_noise
=
[
0.95
,
1.05
],
trans_normal_noise
=
[
0.2
,
0.2
,
0.2
]),
scaling_uniform_noise
=
[
0.95
,
1.05
]),
dict
(
type
=
'PointsRangeFilter'
,
point_cloud_range
=
point_cloud_range
),
dict
(
type
=
'ObjectRangeFilter'
,
point_cloud_range
=
point_cloud_range
),
dict
(
type
=
'PointShuffle'
),
dict
(
type
=
'DefaultFormatBundle3D'
,
class_names
=
class_names
),
dict
(
type
=
'Collect3D'
,
keys
=
[
'points'
,
'gt_bboxes_3d'
,
'gt_labels_3d'
])
,
dict
(
type
=
'Collect3D'
,
keys
=
[
'points'
,
'gt_bboxes_3d'
,
'gt_labels_3d'
])
]
test_pipeline
=
[
dict
(
type
=
'PointsRangeFilter'
,
point_cloud_range
=
point_cloud_range
),
...
...
@@ -148,7 +251,7 @@ test_pipeline = [
type
=
'DefaultFormatBundle3D'
,
class_names
=
class_names
,
with_label
=
False
),
dict
(
type
=
'Collect3D'
,
keys
=
[
'points'
,
'gt_bboxes_3d'
])
,
dict
(
type
=
'Collect3D'
,
keys
=
[
'points'
,
'gt_bboxes_3d'
])
]
data
=
dict
(
...
...
@@ -183,21 +286,19 @@ data = dict(
class_names
=
class_names
,
with_label
=
True
))
# optimizer
lr
=
0.003
# max learning rate
optimizer
=
dict
(
type
=
'AdamW'
,
lr
=
lr
,
betas
=
(
0.95
,
0.99
),
# the momentum is change during training
weight_decay
=
0.001
)
lr
=
0.001
# max learning rate
optimizer
=
dict
(
type
=
'AdamW'
,
lr
=
lr
,
betas
=
(
0.95
,
0.99
),
weight_decay
=
0.01
)
optimizer_config
=
dict
(
grad_clip
=
dict
(
max_norm
=
10
,
norm_type
=
2
))
lr_config
=
dict
(
policy
=
'cosine'
,
warmup
=
'linear'
,
warmup_iters
=
1000
,
warmup_ratio
=
1.0
/
10
,
target_lr
=
1e-5
,
as_ratio
=
True
)
momentum_config
=
None
policy
=
'cyclic'
,
target_ratio
=
(
10
,
1e-4
),
cyclic_times
=
1
,
step_ratio_up
=
0.4
)
momentum_config
=
dict
(
policy
=
'cyclic'
,
target_ratio
=
(
0.85
/
0.95
,
1
),
cyclic_times
=
1
,
step_ratio_up
=
0.4
)
checkpoint_config
=
dict
(
interval
=
1
)
# yapf:disable
log_config
=
dict
(
...
...
@@ -209,8 +310,9 @@ log_config = dict(
# yapf:enable
# runtime settings
total_epochs
=
80
dist_params
=
dict
(
backend
=
'nccl'
,
port
=
29502
)
dist_params
=
dict
(
backend
=
'nccl'
)
log_level
=
'INFO'
find_unused_parameters
=
True
work_dir
=
'./work_dirs/parta2_secfpn_80e'
load_from
=
None
resume_from
=
None
...
...
configs/kitti/hv_PartA2_secfpn_4x8_cyclic_80e_kitti-3d-car.py
0 → 100644
View file @
885a225b
# model settings
voxel_size
=
[
0.05
,
0.05
,
0.1
]
point_cloud_range
=
[
0
,
-
40
,
-
3
,
70.4
,
40
,
1
]
# velodyne coordinates, x, y, z
model
=
dict
(
type
=
'PartA2'
,
voxel_layer
=
dict
(
max_num_points
=
5
,
# max_points_per_voxel
point_cloud_range
=
point_cloud_range
,
voxel_size
=
voxel_size
,
max_voxels
=
(
16000
,
40000
)
# (training, testing) max_coxels
),
voxel_encoder
=
dict
(
type
=
'VoxelFeatureExtractorV3'
,
num_input_features
=
4
,
num_filters
=
[
4
],
with_distance
=
False
),
middle_encoder
=
dict
(
type
=
'SparseUNet'
,
in_channels
=
4
,
output_shape
=
[
41
,
1600
,
1408
],
pre_act
=
False
),
backbone
=
dict
(
type
=
'SECOND'
,
in_channels
=
256
,
layer_nums
=
[
5
,
5
],
layer_strides
=
[
1
,
2
],
out_channels
=
[
128
,
256
]),
neck
=
dict
(
type
=
'SECONDFPN'
,
in_channels
=
[
128
,
256
],
upsample_strides
=
[
1
,
2
],
out_channels
=
[
256
,
256
]),
rpn_head
=
dict
(
type
=
'PartA2RPNHead'
,
class_name
=
[
'Car'
],
in_channels
=
512
,
feat_channels
=
512
,
use_direction_classifier
=
True
,
encode_bg_as_zeros
=
True
,
anchor_generator
=
dict
(
type
=
'Anchor3DRangeGenerator'
,
ranges
=
[[
0
,
-
40.0
,
-
1.78
,
70.4
,
40.0
,
-
1.78
]],
strides
=
[
2
],
sizes
=
[[
1.6
,
3.9
,
1.56
]],
rotations
=
[
0
,
1.57
],
reshape_out
=
False
),
diff_rad_by_sin
=
True
,
assigner_per_size
=
True
,
assign_per_class
=
True
,
bbox_coder
=
dict
(
type
=
'DeltaXYZWLHRBBoxCoder'
),
loss_cls
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
1.0
),
loss_bbox
=
dict
(
type
=
'SmoothL1Loss'
,
beta
=
1.0
/
9.0
,
loss_weight
=
2.0
),
loss_dir
=
dict
(
type
=
'CrossEntropyLoss'
,
use_sigmoid
=
False
,
loss_weight
=
0.2
)),
roi_head
=
dict
(
type
=
'PartAggregationROIHead'
,
num_classes
=
1
,
semantic_head
=
dict
(
type
=
'PointwiseSemanticHead'
,
in_channels
=
16
,
extra_width
=
0.2
,
seg_score_thr
=
0.3
,
num_classes
=
1
,
loss_seg
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
reduction
=
'sum'
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
1.0
),
loss_part
=
dict
(
type
=
'CrossEntropyLoss'
,
use_sigmoid
=
True
,
loss_weight
=
1.0
)),
seg_roi_extractor
=
dict
(
type
=
'Single3DRoIAwareExtractor'
,
roi_layer
=
dict
(
type
=
'RoIAwarePool3d'
,
out_size
=
14
,
max_pts_per_voxel
=
128
,
mode
=
'max'
)),
part_roi_extractor
=
dict
(
type
=
'Single3DRoIAwareExtractor'
,
roi_layer
=
dict
(
type
=
'RoIAwarePool3d'
,
out_size
=
14
,
max_pts_per_voxel
=
128
,
mode
=
'avg'
)),
bbox_head
=
dict
(
type
=
'PartA2BboxHead'
,
num_classes
=
1
,
seg_in_channels
=
16
,
part_in_channels
=
4
,
seg_conv_channels
=
[
64
,
64
],
part_conv_channels
=
[
64
,
64
],
merge_conv_channels
=
[
128
,
128
],
down_conv_channels
=
[
128
,
256
],
bbox_coder
=
dict
(
type
=
'DeltaXYZWLHRBBoxCoder'
),
shared_fc_channels
=
[
256
,
512
,
512
,
512
],
cls_channels
=
[
256
,
256
],
reg_channels
=
[
256
,
256
],
dropout_ratio
=
0.1
,
roi_feat_size
=
14
,
with_corner_loss
=
True
,
loss_bbox
=
dict
(
type
=
'SmoothL1Loss'
,
beta
=
1.0
/
9.0
,
reduction
=
'sum'
,
loss_weight
=
1.0
),
loss_cls
=
dict
(
type
=
'CrossEntropyLoss'
,
use_sigmoid
=
True
,
reduction
=
'sum'
,
loss_weight
=
1.0
))))
# model training and testing settings
train_cfg
=
dict
(
rpn
=
dict
(
assigner
=
dict
(
# for Car
type
=
'MaxIoUAssigner'
,
iou_calculator
=
dict
(
type
=
'BboxOverlapsNearest3D'
),
pos_iou_thr
=
0.6
,
neg_iou_thr
=
0.45
,
min_pos_iou
=
0.45
,
ignore_iof_thr
=-
1
),
allowed_border
=
0
,
pos_weight
=-
1
,
debug
=
False
),
rpn_proposal
=
dict
(
nms_pre
=
9000
,
nms_post
=
512
,
nms_thr
=
0.8
,
score_thr
=
0
,
use_rotate_nms
=
False
),
rcnn
=
dict
(
assigner
=
dict
(
# for Car
type
=
'MaxIoUAssigner'
,
iou_calculator
=
dict
(
type
=
'BboxOverlaps3D'
,
coordinate
=
'lidar'
),
pos_iou_thr
=
0.55
,
neg_iou_thr
=
0.55
,
min_pos_iou
=
0.55
,
ignore_iof_thr
=-
1
),
sampler
=
dict
(
type
=
'IoUNegPiecewiseSampler'
,
num
=
128
,
pos_fraction
=
0.55
,
neg_piece_fractions
=
[
0.8
,
0.2
],
neg_iou_piece_thrs
=
[
0.55
,
0.1
],
neg_pos_ub
=-
1
,
add_gt_as_proposals
=
False
,
return_iou
=
True
),
cls_pos_thr
=
0.75
,
cls_neg_thr
=
0.25
))
test_cfg
=
dict
(
rpn
=
dict
(
nms_pre
=
1024
,
nms_post
=
100
,
nms_thr
=
0.7
,
score_thr
=
0
,
use_rotate_nms
=
True
),
rcnn
=
dict
(
use_rotate_nms
=
True
,
use_raw_score
=
True
,
nms_thr
=
0.01
,
score_thr
=
0.1
))
# dataset settings
dataset_type
=
'KittiDataset'
data_root
=
'data/kitti/'
class_names
=
[
'Car'
]
input_modality
=
dict
(
use_lidar
=
False
,
use_lidar_reduced
=
True
,
use_depth
=
False
,
use_lidar_intensity
=
True
,
use_camera
=
False
)
db_sampler
=
dict
(
root_path
=
data_root
,
info_path
=
data_root
+
'kitti_dbinfos_train.pkl'
,
rate
=
1.0
,
use_road_plane
=
False
,
object_rot_range
=
[
0.0
,
0.0
],
prepare
=
dict
(
filter_by_difficulty
=
[
-
1
],
filter_by_min_points
=
dict
(
Car
=
5
),
),
sample_groups
=
dict
(
Car
=
15
))
train_pipeline
=
[
dict
(
type
=
'ObjectSample'
,
db_sampler
=
db_sampler
),
dict
(
type
=
'ObjectNoise'
,
num_try
=
100
,
loc_noise_std
=
[
1.0
,
1.0
,
0.5
],
global_rot_range
=
[
0.0
,
0.0
],
rot_uniform_noise
=
[
-
0.78539816
,
0.78539816
]),
dict
(
type
=
'RandomFlip3D'
,
flip_ratio
=
0.5
),
dict
(
type
=
'GlobalRotScale'
,
rot_uniform_noise
=
[
-
0.78539816
,
0.78539816
],
scaling_uniform_noise
=
[
0.95
,
1.05
]),
dict
(
type
=
'PointsRangeFilter'
,
point_cloud_range
=
point_cloud_range
),
dict
(
type
=
'ObjectRangeFilter'
,
point_cloud_range
=
point_cloud_range
),
dict
(
type
=
'PointShuffle'
),
dict
(
type
=
'DefaultFormatBundle3D'
,
class_names
=
class_names
),
dict
(
type
=
'Collect3D'
,
keys
=
[
'points'
,
'gt_bboxes_3d'
,
'gt_labels_3d'
])
]
test_pipeline
=
[
dict
(
type
=
'PointsRangeFilter'
,
point_cloud_range
=
point_cloud_range
),
dict
(
type
=
'DefaultFormatBundle3D'
,
class_names
=
class_names
,
with_label
=
False
),
dict
(
type
=
'Collect3D'
,
keys
=
[
'points'
,
'gt_bboxes_3d'
])
]
data
=
dict
(
samples_per_gpu
=
2
,
workers_per_gpu
=
2
,
train
=
dict
(
type
=
dataset_type
,
root_path
=
data_root
,
ann_file
=
data_root
+
'kitti_infos_train.pkl'
,
split
=
'training'
,
training
=
True
,
pipeline
=
train_pipeline
,
modality
=
input_modality
,
class_names
=
class_names
,
with_label
=
True
),
val
=
dict
(
type
=
dataset_type
,
root_path
=
data_root
,
ann_file
=
data_root
+
'kitti_infos_val.pkl'
,
split
=
'training'
,
pipeline
=
test_pipeline
,
modality
=
input_modality
,
class_names
=
class_names
,
with_label
=
True
),
test
=
dict
(
type
=
dataset_type
,
root_path
=
data_root
,
ann_file
=
data_root
+
'kitti_infos_val.pkl'
,
split
=
'testing'
,
pipeline
=
test_pipeline
,
modality
=
input_modality
,
class_names
=
class_names
,
with_label
=
True
))
# optimizer
lr
=
0.001
# max learning rate
optimizer
=
dict
(
type
=
'AdamW'
,
lr
=
lr
,
betas
=
(
0.95
,
0.99
),
weight_decay
=
0.01
)
optimizer_config
=
dict
(
grad_clip
=
dict
(
max_norm
=
10
,
norm_type
=
2
))
lr_config
=
dict
(
policy
=
'cyclic'
,
target_ratio
=
(
10
,
1e-4
),
cyclic_times
=
1
,
step_ratio_up
=
0.4
)
momentum_config
=
dict
(
policy
=
'cyclic'
,
target_ratio
=
(
0.85
/
0.95
,
1
),
cyclic_times
=
1
,
step_ratio_up
=
0.4
)
checkpoint_config
=
dict
(
interval
=
1
)
# yapf:disable
log_config
=
dict
(
interval
=
50
,
hooks
=
[
dict
(
type
=
'TextLoggerHook'
),
dict
(
type
=
'TensorboardLoggerHook'
)
])
# yapf:enable
# runtime settings
total_epochs
=
80
dist_params
=
dict
(
backend
=
'nccl'
)
log_level
=
'INFO'
find_unused_parameters
=
True
work_dir
=
'./work_dirs/parta2_secfpn_80e'
load_from
=
None
resume_from
=
None
workflow
=
[(
'train'
,
1
)]
mmdet3d/core/bbox/__init__.py
View file @
885a225b
...
...
@@ -8,7 +8,7 @@ from .samplers import (BaseSampler, CombinedSampler,
InstanceBalancedPosSampler
,
IoUBalancedNegSampler
,
PseudoSampler
,
RandomSampler
,
SamplingResult
)
from
.structures
import
Box3DMode
,
CameraInstance3DBoxes
,
LiDARInstance3DBoxes
from
.transforms
import
boxes3d_to_bev_torch_lidar
from
.transforms
import
bbox3d2roi
,
boxes3d_to_bev_torch_lidar
from
.assign_sampling
import
(
# isort:skip, avoid recursive imports
build_bbox_coder
,
# temporally settings
...
...
@@ -22,5 +22,5 @@ __all__ = [
'build_bbox_coder'
,
'DeltaXYZWLHRBBoxCoder'
,
'boxes3d_to_bev_torch_lidar'
,
'BboxOverlapsNearest3D'
,
'BboxOverlaps3D'
,
'bbox_overlaps_nearest_3d'
,
'bbox_overlaps_3d'
,
'Box3DMode'
,
'LiDARInstance3DBoxes'
,
'CameraInstance3DBoxes'
'CameraInstance3DBoxes'
,
'bbox3d2roi'
]
mmdet3d/core/bbox/box_np_ops.py
View file @
885a225b
...
...
@@ -566,3 +566,69 @@ def points_in_convex_polygon_jit(points, polygon, clockwise=True):
break
ret
[
i
,
j
]
=
success
return
ret
def
boxes3d_to_corners3d_lidar
(
boxes3d
,
bottom_center
=
True
):
"""convert kitti center boxes to corners
7 -------- 4
/| /|
6 -------- 5 .
| | | |
. 3 -------- 0
|/ |/
2 -------- 1
Args:
boxes3d (numpy.array): (N, 7) [x, y, z, w, l, h, ry] in LiDAR coords,
see the definition of ry in KITTI dataset
bottom_center (bool): whether z is on the bottom center of object.
Returns:
numpy.array: box corners with shape (N, 8, 3)
"""
boxes_num
=
boxes3d
.
shape
[
0
]
w
,
l
,
h
=
boxes3d
[:,
3
],
boxes3d
[:,
4
],
boxes3d
[:,
5
]
x_corners
=
np
.
array
(
[
w
/
2.
,
-
w
/
2.
,
-
w
/
2.
,
w
/
2.
,
w
/
2.
,
-
w
/
2.
,
-
w
/
2.
,
w
/
2.
],
dtype
=
np
.
float32
).
T
y_corners
=
np
.
array
(
[
-
l
/
2.
,
-
l
/
2.
,
l
/
2.
,
l
/
2.
,
-
l
/
2.
,
-
l
/
2.
,
l
/
2.
,
l
/
2.
],
dtype
=
np
.
float32
).
T
if
bottom_center
:
z_corners
=
np
.
zeros
((
boxes_num
,
8
),
dtype
=
np
.
float32
)
z_corners
[:,
4
:
8
]
=
h
.
reshape
(
boxes_num
,
1
).
repeat
(
4
,
axis
=
1
)
# (N, 8)
else
:
z_corners
=
np
.
array
([
-
h
/
2.
,
-
h
/
2.
,
-
h
/
2.
,
-
h
/
2.
,
h
/
2.
,
h
/
2.
,
h
/
2.
,
h
/
2.
],
dtype
=
np
.
float32
).
T
ry
=
boxes3d
[:,
6
]
zeros
,
ones
=
np
.
zeros
(
ry
.
size
,
dtype
=
np
.
float32
),
np
.
ones
(
ry
.
size
,
dtype
=
np
.
float32
)
rot_list
=
np
.
array
([[
np
.
cos
(
ry
),
-
np
.
sin
(
ry
),
zeros
],
[
np
.
sin
(
ry
),
np
.
cos
(
ry
),
zeros
],
[
zeros
,
zeros
,
ones
]])
# (3, 3, N)
R_list
=
np
.
transpose
(
rot_list
,
(
2
,
0
,
1
))
# (N, 3, 3)
temp_corners
=
np
.
concatenate
((
x_corners
.
reshape
(
-
1
,
8
,
1
),
y_corners
.
reshape
(
-
1
,
8
,
1
),
z_corners
.
reshape
(
-
1
,
8
,
1
)),
axis
=
2
)
# (N, 8, 3)
rotated_corners
=
np
.
matmul
(
temp_corners
,
R_list
)
# (N, 8, 3)
x_corners
=
rotated_corners
[:,
:,
0
]
y_corners
=
rotated_corners
[:,
:,
1
]
z_corners
=
rotated_corners
[:,
:,
2
]
x_loc
,
y_loc
,
z_loc
=
boxes3d
[:,
0
],
boxes3d
[:,
1
],
boxes3d
[:,
2
]
x
=
x_loc
.
reshape
(
-
1
,
1
)
+
x_corners
.
reshape
(
-
1
,
8
)
y
=
y_loc
.
reshape
(
-
1
,
1
)
+
y_corners
.
reshape
(
-
1
,
8
)
z
=
z_loc
.
reshape
(
-
1
,
1
)
+
z_corners
.
reshape
(
-
1
,
8
)
corners
=
np
.
concatenate
(
(
x
.
reshape
(
-
1
,
8
,
1
),
y
.
reshape
(
-
1
,
8
,
1
),
z
.
reshape
(
-
1
,
8
,
1
)),
axis
=
2
)
return
corners
.
astype
(
np
.
float32
)
mmdet3d/core/bbox/box_torch_ops.py
View file @
885a225b
...
...
@@ -210,3 +210,70 @@ def enlarge_box3d_lidar(boxes3d, extra_width):
large_boxes3d
[:,
3
:
6
]
+=
extra_width
*
2
large_boxes3d
[:,
2
]
-=
extra_width
# bottom center z minus extra_width
return
large_boxes3d
def
boxes3d_to_corners3d_lidar_torch
(
boxes3d
,
bottom_center
=
True
):
"""convert kitti center boxes to corners
7 -------- 4
/| /|
6 -------- 5 .
| | | |
. 3 -------- 0
|/ |/
2 -------- 1
Args:
boxes3d (FloatTensor): (N, 7) [x, y, z, w, l, h, ry] in LiDAR coords,
see the definition of ry in KITTI dataset
bottom_center (bool): whether z is on the bottom center of object.
Returns:
FloatTensor: box corners with shape (N, 8, 3)
"""
boxes_num
=
boxes3d
.
shape
[
0
]
w
,
l
,
h
=
boxes3d
[:,
3
:
4
],
boxes3d
[:,
4
:
5
],
boxes3d
[:,
5
:
6
]
ry
=
boxes3d
[:,
6
:
7
]
zeros
=
boxes3d
.
new_zeros
(
boxes_num
,
1
)
ones
=
boxes3d
.
new_ones
(
boxes_num
,
1
)
x_corners
=
torch
.
cat
(
[
w
/
2.
,
-
w
/
2.
,
-
w
/
2.
,
w
/
2.
,
w
/
2.
,
-
w
/
2.
,
-
w
/
2.
,
w
/
2.
],
dim
=
1
)
# (N, 8)
y_corners
=
torch
.
cat
(
[
-
l
/
2.
,
-
l
/
2.
,
l
/
2.
,
l
/
2.
,
-
l
/
2.
,
-
l
/
2.
,
l
/
2.
,
l
/
2.
],
dim
=
1
)
# (N, 8)
if
bottom_center
:
z_corners
=
torch
.
cat
([
zeros
,
zeros
,
zeros
,
zeros
,
h
,
h
,
h
,
h
],
dim
=
1
)
# (N, 8)
else
:
z_corners
=
torch
.
cat
([
-
h
/
2.
,
-
h
/
2.
,
-
h
/
2.
,
-
h
/
2.
,
h
/
2.
,
h
/
2.
,
h
/
2.
,
h
/
2.
],
dim
=
1
)
# (N, 8)
temp_corners
=
torch
.
cat
(
(
x_corners
.
unsqueeze
(
dim
=
2
),
y_corners
.
unsqueeze
(
dim
=
2
),
z_corners
.
unsqueeze
(
dim
=
2
)),
dim
=
2
)
# (N, 8, 3)
cosa
,
sina
=
torch
.
cos
(
ry
),
torch
.
sin
(
ry
)
raw_1
=
torch
.
cat
([
cosa
,
-
sina
,
zeros
],
dim
=
1
)
# (N, 3)
raw_2
=
torch
.
cat
([
sina
,
cosa
,
zeros
],
dim
=
1
)
# (N, 3)
raw_3
=
torch
.
cat
([
zeros
,
zeros
,
ones
],
dim
=
1
)
# (N, 3)
R
=
torch
.
cat
((
raw_1
.
unsqueeze
(
dim
=
1
),
raw_2
.
unsqueeze
(
dim
=
1
),
raw_3
.
unsqueeze
(
dim
=
1
)),
dim
=
1
)
# (N, 3, 3)
rotated_corners
=
torch
.
matmul
(
temp_corners
,
R
)
# (N, 8, 3)
x_corners
=
rotated_corners
[:,
:,
0
]
y_corners
=
rotated_corners
[:,
:,
1
]
z_corners
=
rotated_corners
[:,
:,
2
]
x_loc
,
y_loc
,
z_loc
=
boxes3d
[:,
0
],
boxes3d
[:,
1
],
boxes3d
[:,
2
]
x
=
x_loc
.
view
(
-
1
,
1
)
+
x_corners
.
view
(
-
1
,
8
)
y
=
y_loc
.
view
(
-
1
,
1
)
+
y_corners
.
view
(
-
1
,
8
)
z
=
z_loc
.
view
(
-
1
,
1
)
+
z_corners
.
view
(
-
1
,
8
)
corners
=
torch
.
cat
((
x
.
view
(
-
1
,
8
,
1
),
y
.
view
(
-
1
,
8
,
1
),
z
.
view
(
-
1
,
8
,
1
)),
dim
=
2
)
return
corners
mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py
View file @
885a225b
...
...
@@ -88,6 +88,11 @@ def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'):
assert
bboxes1
.
size
(
-
1
)
==
bboxes2
.
size
(
-
1
)
==
7
assert
coordinate
in
[
'camera'
,
'lidar'
]
rows
=
bboxes1
.
size
(
0
)
cols
=
bboxes2
.
size
(
0
)
if
rows
*
cols
==
0
:
return
bboxes1
.
new
(
rows
,
cols
)
if
coordinate
==
'camera'
:
return
boxes_iou3d_gpu_camera
(
bboxes1
,
bboxes2
,
mode
)
elif
coordinate
==
'lidar'
:
...
...
mmdet3d/core/bbox/transforms.py
View file @
885a225b
...
...
@@ -47,3 +47,25 @@ def boxes3d_to_bev_torch_lidar(boxes3d):
boxes_bev
[:,
2
],
boxes_bev
[:,
3
]
=
cu
+
half_w
,
cv
+
half_l
boxes_bev
[:,
4
]
=
boxes3d
[:,
6
]
return
boxes_bev
def
bbox3d2roi
(
bbox_list
):
"""Convert a list of bboxes to roi format.
Args:
bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
of images.
Returns:
Tensor: shape (n, c), [batch_ind, x, y ...]
"""
rois_list
=
[]
for
img_id
,
bboxes
in
enumerate
(
bbox_list
):
if
bboxes
.
size
(
0
)
>
0
:
img_inds
=
bboxes
.
new_full
((
bboxes
.
size
(
0
),
1
),
img_id
)
rois
=
torch
.
cat
([
img_inds
,
bboxes
],
dim
=-
1
)
else
:
rois
=
torch
.
zeros_like
(
bboxes
)
rois_list
.
append
(
rois
)
rois
=
torch
.
cat
(
rois_list
,
0
)
return
rois
mmdet3d/core/evaluation/kitti_utils/eval.py
View file @
885a225b
...
...
@@ -731,7 +731,6 @@ def kitti_eval(gt_annos,
result
+=
'aos AP:{:.2f}, {:.2f}, {:.2f}
\n
'
.
format
(
*
mAPaos
[:,
0
])
# prepare results for logger
ret_dict
[
'Overall'
]
=
dict
()
for
idx
in
range
(
3
):
postfix
=
f
'
{
difficulty
[
idx
]
}
'
if
mAP3d
is
not
None
:
...
...
mmdet3d/models/anchor_heads/parta2_rpn_head.py
View file @
885a225b
...
...
@@ -231,19 +231,15 @@ class PartA2RPNHead(SECONDHead):
labels
=
labels
[
inds
]
scores
=
scores
[
inds
]
cls_scores
=
cls_scores
[
inds
]
dir_scores
=
dir_scores
[
inds
]
return
dict
(
box3d_lidar
=
bboxes
.
cpu
(),
scores
=
scores
.
cpu
(),
label_preds
=
labels
.
cpu
(),
cls_preds
=
cls_scores
.
cpu
(
)
# raw scores with shape [max_num, cls_num]
box3d_lidar
=
bboxes
,
scores
=
scores
,
label_preds
=
labels
,
cls_preds
=
cls_scores
# raw scores [max_num, cls_num]
)
else
:
return
dict
(
box3d_lidar
=
mlvl_bboxes
.
new_zeros
([
0
,
self
.
box_code_size
]).
cpu
(),
scores
=
mlvl_bboxes
.
new_zeros
([
0
]).
cpu
(),
label_preds
=
mlvl_bboxes
.
new_zeros
([
0
]).
cpu
(),
cls_preds
=
mlvl_bboxes
.
new_zeros
([
0
,
mlvl_cls_score
.
shape
[
-
1
]
]).
cpu
())
box3d_lidar
=
mlvl_bboxes
.
new_zeros
([
0
,
self
.
box_code_size
]),
scores
=
mlvl_bboxes
.
new_zeros
([
0
]),
label_preds
=
mlvl_bboxes
.
new_zeros
([
0
]),
cls_preds
=
mlvl_bboxes
.
new_zeros
([
0
,
mlvl_cls_score
.
shape
[
-
1
]]))
mmdet3d/models/anchor_heads/second_head.py
View file @
885a225b
...
...
@@ -258,9 +258,9 @@ class SECONDHead(nn.Module, AnchorTrainMixin):
dir_weights_list
,
num_total_samples
=
num_total_samples
)
return
dict
(
loss_cls
_3d
=
losses_cls
,
loss_bbox
_3d
=
losses_bbox
,
loss_dir
_3d
=
losses_dir
)
loss_
rpn_
cls
=
losses_cls
,
loss_
rpn_
bbox
=
losses_bbox
,
loss_
rpn_
dir
=
losses_dir
)
def
get_bboxes
(
self
,
cls_scores
,
...
...
mmdet3d/models/detectors/parta2.py
View file @
885a225b
...
...
@@ -34,11 +34,13 @@ class PartA2(TwoStageDetector):
self
.
middle_encoder
=
builder
.
build_middle_encoder
(
middle_encoder
)
def
extract_feat
(
self
,
points
,
img_meta
):
voxels
,
num_points
,
coors
=
self
.
voxelize
(
points
)
voxel_dict
=
dict
(
voxels
=
voxels
,
num_points
=
num_points
,
coors
=
coors
)
voxel_features
=
self
.
voxel_encoder
(
voxels
,
num_points
,
coors
)
batch_size
=
coors
[
-
1
,
0
].
item
()
+
1
feats_dict
=
self
.
middle_encoder
(
voxel_features
,
coors
,
batch_size
)
voxel_dict
=
self
.
voxelize
(
points
)
voxel_features
=
self
.
voxel_encoder
(
voxel_dict
[
'voxels'
],
voxel_dict
[
'num_points'
],
voxel_dict
[
'coors'
])
batch_size
=
voxel_dict
[
'coors'
][
-
1
,
0
].
item
()
+
1
feats_dict
=
self
.
middle_encoder
(
voxel_features
,
voxel_dict
[
'coors'
],
batch_size
)
x
=
self
.
backbone
(
feats_dict
[
'spatial_features'
])
if
self
.
with_neck
:
neck_feats
=
self
.
neck
(
x
)
...
...
@@ -47,20 +49,33 @@ class PartA2(TwoStageDetector):
@
torch
.
no_grad
()
def
voxelize
(
self
,
points
):
voxels
,
coors
,
num_points
=
[],
[],
[]
voxels
,
coors
,
num_points
,
voxel_centers
=
[],
[],
[],
[]
for
res
in
points
:
res_voxels
,
res_coors
,
res_num_points
=
self
.
voxel_layer
(
res
)
res_voxel_centers
=
(
res_coors
[:,
[
2
,
1
,
0
]]
+
0.5
)
*
res_voxels
.
new_tensor
(
self
.
voxel_layer
.
voxel_size
)
+
res_voxels
.
new_tensor
(
self
.
voxel_layer
.
point_cloud_range
[
0
:
3
])
voxels
.
append
(
res_voxels
)
coors
.
append
(
res_coors
)
num_points
.
append
(
res_num_points
)
voxel_centers
.
append
(
res_voxel_centers
)
voxels
=
torch
.
cat
(
voxels
,
dim
=
0
)
num_points
=
torch
.
cat
(
num_points
,
dim
=
0
)
voxel_centers
=
torch
.
cat
(
voxel_centers
,
dim
=
0
)
coors_batch
=
[]
for
i
,
coor
in
enumerate
(
coors
):
coor_pad
=
F
.
pad
(
coor
,
(
1
,
0
),
mode
=
'constant'
,
value
=
i
)
coors_batch
.
append
(
coor_pad
)
coors_batch
=
torch
.
cat
(
coors_batch
,
dim
=
0
)
return
voxels
,
num_points
,
coors_batch
voxel_dict
=
dict
(
voxels
=
voxels
,
num_points
=
num_points
,
coors
=
coors_batch
,
voxel_centers
=
voxel_centers
)
return
voxel_dict
def
forward_train
(
self
,
points
,
...
...
@@ -69,7 +84,6 @@ class PartA2(TwoStageDetector):
gt_labels_3d
,
gt_bboxes_ignore
=
None
,
proposals
=
None
):
# TODO: complete it
feats_dict
,
voxels_dict
=
self
.
extract_feat
(
points
,
img_meta
)
losses
=
dict
()
...
...
@@ -86,7 +100,13 @@ class PartA2(TwoStageDetector):
proposal_inputs
=
rpn_outs
+
(
img_meta
,
proposal_cfg
)
proposal_list
=
self
.
rpn_head
.
get_bboxes
(
*
proposal_inputs
)
else
:
proposal_list
=
proposals
# noqa: F841
proposal_list
=
proposals
roi_losses
=
self
.
roi_head
.
forward_train
(
feats_dict
,
voxels_dict
,
img_meta
,
proposal_list
,
gt_bboxes_3d
,
gt_labels_3d
)
losses
.
update
(
roi_losses
)
return
losses
...
...
@@ -102,16 +122,18 @@ class PartA2(TwoStageDetector):
def
simple_test
(
self
,
points
,
img_meta
,
gt_bboxes_3d
=
None
,
gt_bboxes_3d
,
proposals
=
None
,
rescale
=
False
):
feats_dict
,
voxels_dict
=
self
.
extract_feat
(
points
,
img_meta
)
# TODO: complete it
if
proposals
is
None
:
proposal_list
=
self
.
simple_test_rpn
(
feats_dict
[
'neck_feats'
],
img_meta
,
self
.
test_cfg
.
rpn
)
if
self
.
with_rpn
:
rpn_outs
=
self
.
rpn_head
(
feats_dict
[
'neck_feats'
])
proposal_cfg
=
self
.
test_cfg
.
rpn
bbox_inputs
=
rpn_outs
+
(
img_meta
,
proposal_cfg
)
proposal_list
=
self
.
rpn_head
.
get_bboxes
(
*
bbox_inputs
)
else
:
proposal_list
=
proposals
return
self
.
roi_head
.
simple_test
(
feats_dict
,
proposal_list
,
img_meta
,
rescale
=
rescale
)
return
self
.
roi_head
.
simple_test
(
feats_dict
,
voxels_dict
,
img_meta
,
proposal_list
)
mmdet3d/models/roi_heads/__init__.py
View file @
885a225b
from
.base_3droi_head
import
Base3DRoIHead
from
.bbox_heads
import
PartA2BboxHead
from
.mask_heads
import
PointwiseSemanticHead
from
.part_aggregation_roi_head
import
PartAggregationROIHead
from
.roi_extractors
import
Single3DRoIAwareExtractor
__all__
=
[
'PointwiseSemanticHead'
]
__all__
=
[
'Base3DRoIHead'
,
'PartAggregationROIHead'
,
'PointwiseSemanticHead'
,
'Single3DRoIAwareExtractor'
,
'PartA2BboxHead'
]
mmdet3d/models/roi_heads/base_3droi_head.py
0 → 100644
View file @
885a225b
from
abc
import
ABCMeta
,
abstractmethod
import
torch.nn
as
nn
class Base3DRoIHead(nn.Module, metaclass=ABCMeta):
    """Base class for 3D RoI heads.

    Subclasses implement the ``init_*`` hooks and ``forward_train``;
    ``__init__`` wires the optional bbox/mask heads and the
    assigner/sampler through those hooks.

    Fix: the abstract ``init_bbox_head`` / ``init_mask_head`` signatures
    previously took only ``self`` although ``__init__`` calls them with
    ``bbox_head`` and ``(mask_roi_extractor, mask_head)`` respectively.
    The declared signatures now match the call sites so subclass authors
    are not misled (runtime behavior is unchanged — Python does not
    enforce abstract signatures).

    Args:
        bbox_head (dict, optional): Config of the bbox head.
        mask_roi_extractor (dict, optional): Config of the mask RoI
            extractor, forwarded to ``init_mask_head``.
        mask_head (dict, optional): Config of the mask head.
        train_cfg (dict, optional): Training config.
        test_cfg (dict, optional): Testing config.
    """

    def __init__(self,
                 bbox_head=None,
                 mask_roi_extractor=None,
                 mask_head=None,
                 train_cfg=None,
                 test_cfg=None):
        super(Base3DRoIHead, self).__init__()
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        # Heads are built lazily through the subclass hooks so each
        # subclass controls how its configs are turned into modules.
        if bbox_head is not None:
            self.init_bbox_head(bbox_head)

        if mask_head is not None:
            self.init_mask_head(mask_roi_extractor, mask_head)

        self.init_assigner_sampler()

    @property
    def with_bbox(self):
        """bool: whether a bbox head was built."""
        return hasattr(self, 'bbox_head') and self.bbox_head is not None

    @property
    def with_mask(self):
        """bool: whether a mask head was built."""
        return hasattr(self, 'mask_head') and self.mask_head is not None

    @abstractmethod
    def init_weights(self, pretrained):
        """Initialize weights, optionally from a pretrained checkpoint."""
        pass

    @abstractmethod
    def init_bbox_head(self, bbox_head):
        """Build ``self.bbox_head`` from its config."""
        pass

    @abstractmethod
    def init_mask_head(self, mask_roi_extractor, mask_head):
        """Build ``self.mask_head`` (and its RoI extractor) from config."""
        pass

    @abstractmethod
    def init_assigner_sampler(self):
        """Build the bbox assigner and sampler from ``self.train_cfg``."""
        pass

    @abstractmethod
    def forward_train(self,
                      x,
                      img_meta,
                      proposal_list,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      **kwargs):
        """Forward function during training"""
        pass

    def simple_test(self,
                    x,
                    proposal_list,
                    img_meta,
                    proposals=None,
                    rescale=False,
                    **kwargs):
        """Test without augmentation."""
        pass

    def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
        """Test with augmentations.

        If rescale is False, then returned bboxes and masks will fit the scale
        of imgs[0].
        """
        pass
mmdet3d/models/roi_heads/bbox_heads/__init__.py
View file @
885a225b
...
...
# mmdet3d/models/roi_heads/bbox_heads/__init__.py
#
# Fix: the diff-overlaid ``__all__`` contained duplicated entries
# ('Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead') and a missing comma
# (implicit string concatenation 'DoubleConvFCBBoxHead' 'Shared4Conv1FCBBoxHead'
# would have produced one bogus merged name). Reconstructed as a single,
# de-duplicated export list.
from mmdet.models.roi_heads.bbox_heads import (BBoxHead, ConvFCBBoxHead,
                                               DoubleConvFCBBoxHead,
                                               Shared2FCBBoxHead,
                                               Shared4Conv1FCBBoxHead)

from .parta2_bbox_head import PartA2BboxHead

__all__ = [
    'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead',
    'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', 'PartA2BboxHead'
]
mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py
0 → 100644
View file @
885a225b
This diff is collapsed.
Click to expand it.
mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py
View file @
885a225b
...
...
@@ -126,22 +126,21 @@ class PointwiseSemanticHead(nn.Module):
part_targets
=
torch
.
cat
(
part_targets
,
dim
=
0
)
return
dict
(
seg_targets
=
seg_targets
,
part_targets
=
part_targets
)
def
loss
(
self
,
se
g_preds
,
part_preds
,
seg_targets
,
part
_targets
):
def
loss
(
self
,
se
mantic_results
,
semantic
_targets
):
"""Calculate point-wise segmentation and part prediction losses.
Args:
seg_preds (torch.Tensor): prediction of binary
segmentation with shape [voxel_num, 1].
part_preds (torch.Tensor): prediction of part
with shape [voxel_num, 3].
seg_targets (torch.Tensor): target of segmentation
with shape [voxel_num, 1].
part_targets (torch.Tensor): target of part with
shape [voxel_num, 3].
semantic_results (dict): Results from semantic head.
semantic_targets (dict): Targets of semantic results.
Returns:
dict: loss of segmentation and part prediction.
"""
seg_preds
=
semantic_results
[
'seg_preds'
]
part_preds
=
semantic_results
[
'part_preds'
]
seg_targets
=
semantic_targets
[
'seg_targets'
]
part_targets
=
semantic_targets
[
'part_targets'
]
pos_mask
=
(
seg_targets
>
-
1
)
&
(
seg_targets
<
self
.
num_classes
)
binary_seg_target
=
pos_mask
.
long
()
pos
=
pos_mask
.
float
()
...
...
mmdet3d/models/roi_heads/part_aggregation_roi_head.py
0 → 100644
View file @
885a225b
import
torch.nn.functional
as
F
from
mmdet3d.core
import
AssignResult
from
mmdet3d.core.bbox
import
bbox3d2roi
from
mmdet.core
import
build_assigner
,
build_sampler
from
mmdet.models
import
HEADS
from
..builder
import
build_head
,
build_roi_extractor
from
.base_3droi_head
import
Base3DRoIHead
@HEADS.register_module
class PartAggregationROIHead(Base3DRoIHead):
    """Part aggregation roi head for PartA2

    Second stage of PartA2: a point-wise semantic head predicts
    foreground/part features for every voxel, RoI-aware extractors pool
    those features inside each proposal, and a part-aware bbox head
    refines the proposals.
    """

    def __init__(self,
                 semantic_head,
                 num_classes=3,
                 seg_roi_extractor=None,
                 part_roi_extractor=None,
                 bbox_head=None,
                 train_cfg=None,
                 test_cfg=None):
        super(PartAggregationROIHead, self).__init__(
            bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg)
        self.num_classes = num_classes
        # The bbox branch consumes part features from the semantic head,
        # so the semantic head is mandatory for this RoI head.
        assert semantic_head is not None
        self.semantic_head = build_head(semantic_head)

        if seg_roi_extractor is not None:
            self.seg_roi_extractor = build_roi_extractor(seg_roi_extractor)
        if part_roi_extractor is not None:
            self.part_roi_extractor = build_roi_extractor(part_roi_extractor)

        self.init_assigner_sampler()

    def init_weights(self, pretrained):
        # Sub-modules handle their own weight initialization.
        pass

    def init_mask_head(self):
        # PartA2 has no mask head.
        pass

    def init_bbox_head(self, bbox_head):
        """Build the part-aware bbox head from config."""
        self.bbox_head = build_head(bbox_head)

    def init_assigner_sampler(self):
        """Build bbox assigner(s) and sampler from ``train_cfg``.

        A dict config yields a single assigner; a list yields one
        assigner per class (used by ``_assign_and_sample`` below).
        """
        self.bbox_assigner = None
        self.bbox_sampler = None
        if self.train_cfg:
            if isinstance(self.train_cfg.assigner, dict):
                self.bbox_assigner = build_assigner(self.train_cfg.assigner)
            elif isinstance(self.train_cfg.assigner, list):
                self.bbox_assigner = [
                    build_assigner(res) for res in self.train_cfg.assigner
                ]
            self.bbox_sampler = build_sampler(self.train_cfg.sampler)

    @property
    def with_semantic(self):
        """bool: whether the head has a point-wise semantic head."""
        return hasattr(self,
                       'semantic_head') and self.semantic_head is not None

    def forward_train(self, feats_dict, voxels_dict, img_meta, proposal_list,
                      gt_bboxes_3d, gt_labels_3d):
        """Training forward function of PartAggregationROIHead

        Args:
            feats_dict (dict): Contains features from the first stage.
            voxels_dict (dict): Contains information of voxels.
            img_metas (list[dict]): Meta info of each image.
            proposal_list (list[dict]): Proposal information from rpn.
            gt_bboxes_3d (list[FloatTensor]): GT bboxes of each batch.
            gt_labels_3d (list[LongTensor]): GT labels of each batch.

        Returns:
            dict: losses from each head.
        """
        losses = dict()
        if self.with_semantic:
            semantic_results = self._semantic_forward_train(
                feats_dict['seg_features'], voxels_dict, gt_bboxes_3d,
                gt_labels_3d)
            losses.update(semantic_results['loss_semantic'])

        sample_results = self._assign_and_sample(proposal_list, gt_bboxes_3d,
                                                 gt_labels_3d)
        if self.with_bbox:
            # NOTE(review): relies on `semantic_results` from the branch
            # above; `with_semantic` is always true because __init__
            # asserts a semantic head exists.
            bbox_results = self._bbox_forward_train(
                feats_dict['seg_features'], semantic_results['part_feats'],
                voxels_dict, sample_results)
            losses.update(bbox_results['loss_bbox'])

        return losses

    def simple_test(self, feats_dict, voxels_dict, img_meta, proposal_list,
                    **kwargs):
        """Simple testing forward function of PartAggregationROIHead

        Args:
            feats_dict (dict): Contains features from the first stage.
            voxels_dict (dict): Contains information of voxels.
            img_metas (list[dict]): Meta info of each image.
            proposal_list (list[dict]): Proposal information from rpn.

        Returns:
            list[dict]: Bbox results of each batch.
        """
        assert self.with_bbox, 'Bbox head must be implemented.'
        assert self.with_semantic

        semantic_results = self.semantic_head(feats_dict['seg_features'])

        # Flatten per-batch proposals into a single RoI tensor
        # (batch index is prepended by bbox3d2roi).
        rois = bbox3d2roi([res['box3d_lidar'] for res in proposal_list])
        label_preds = [res['label_preds'] for res in proposal_list]
        cls_preds = [res['cls_preds'] for res in proposal_list]
        bbox_results = self._bbox_forward(feats_dict['seg_features'],
                                          semantic_results['part_feats'],
                                          voxels_dict, rois)

        bbox_list = self.bbox_head.get_bboxes(
            rois,
            bbox_results['cls_score'],
            bbox_results['bbox_pred'],
            label_preds,
            cls_preds,
            img_meta,
            cfg=self.test_cfg)
        return bbox_list

    def _bbox_forward_train(self, seg_feats, part_feats, voxels_dict,
                            sampling_results):
        """Run the bbox head on sampled proposals and compute its loss."""
        rois = bbox3d2roi([res.bboxes for res in sampling_results])
        bbox_results = self._bbox_forward(seg_feats, part_feats, voxels_dict,
                                          rois)

        bbox_targets = self.bbox_head.get_targets(sampling_results,
                                                  self.train_cfg)
        loss_bbox = self.bbox_head.loss(bbox_results['cls_score'],
                                        bbox_results['bbox_pred'], rois,
                                        *bbox_targets)

        bbox_results.update(loss_bbox=loss_bbox)
        return bbox_results

    def _bbox_forward(self, seg_feats, part_feats, voxels_dict, rois):
        """Pool seg/part features inside each RoI and run the bbox head.

        ``voxels_dict['coors'][..., 0]`` is the per-voxel batch index used
        by the RoI-aware extractors to match voxels to RoIs.
        """
        pooled_seg_feats = self.seg_roi_extractor(seg_feats,
                                                  voxels_dict['voxel_centers'],
                                                  voxels_dict['coors'][..., 0],
                                                  rois)
        pooled_part_feats = self.part_roi_extractor(
            part_feats, voxels_dict['voxel_centers'],
            voxels_dict['coors'][..., 0], rois)
        cls_score, bbox_pred = self.bbox_head(pooled_seg_feats,
                                              pooled_part_feats)

        bbox_results = dict(
            cls_score=cls_score,
            bbox_pred=bbox_pred,
            pooled_seg_feats=pooled_seg_feats,
            pooled_part_feats=pooled_part_feats)
        return bbox_results

    def _assign_and_sample(self, proposal_list, gt_bboxes_3d, gt_labels_3d):
        """Assign GTs to proposals and sample them, per batch element.

        With a per-class assigner list, each class is assigned
        independently and the per-class results are merged back into one
        batch-wide ``AssignResult`` via an index-remapping trick (see the
        padding comments below).
        """
        sampling_results = []
        # bbox assign
        for batch_idx in range(len(proposal_list)):
            cur_proposal_list = proposal_list[batch_idx]
            cur_boxes = cur_proposal_list['box3d_lidar']
            cur_label_preds = cur_proposal_list['label_preds']
            cur_gt_bboxes = gt_bboxes_3d[batch_idx]
            cur_gt_labels = gt_labels_3d[batch_idx]
            batch_num_gts = 0
            batch_gt_indis = cur_gt_labels.new_full((cur_boxes.shape[0], ),
                                                    0)  # 0 is bg
            batch_max_overlaps = cur_boxes.new_zeros(cur_boxes.shape[0])
            batch_gt_labels = cur_gt_labels.new_full((cur_boxes.shape[0], ),
                                                     -1)  # -1 is bg
            if isinstance(self.bbox_assigner, list):
                # for multi classes
                for i, assigner in enumerate(self.bbox_assigner):
                    gt_per_cls = (cur_gt_labels == i)
                    pred_per_cls = (cur_label_preds == i)
                    cur_assign_res = assigner.assign(
                        cur_boxes[pred_per_cls],
                        cur_gt_bboxes[gt_per_cls],
                        gt_labels=cur_gt_labels[gt_per_cls])
                    # gather assign_results in different class into one result
                    batch_num_gts += cur_assign_res.num_gts
                    # gt inds (1-based)
                    gt_inds_arange_pad = gt_per_cls.nonzero().view(-1) + 1
                    # pad 0 for indice unassigned
                    gt_inds_arange_pad = F.pad(
                        gt_inds_arange_pad, (1, 0), mode='constant', value=0)
                    # pad -1 for indice ignore
                    gt_inds_arange_pad = F.pad(
                        gt_inds_arange_pad, (1, 0), mode='constant', value=-1)
                    # convert to 0~gt_num+2 for indices
                    gt_inds_arange_pad += 1
                    # now 0 is bg, >1 is fg in batch_gt_indis
                    batch_gt_indis[pred_per_cls] = gt_inds_arange_pad[
                        cur_assign_res.gt_inds + 1] - 1
                    batch_max_overlaps[
                        pred_per_cls] = cur_assign_res.max_overlaps
                    batch_gt_labels[pred_per_cls] = cur_assign_res.labels

                assign_result = AssignResult(batch_num_gts, batch_gt_indis,
                                             batch_max_overlaps,
                                             batch_gt_labels)
            else:  # for single class
                assign_result = self.bbox_assigner.assign(
                    cur_boxes, cur_gt_bboxes, gt_labels=cur_gt_labels)
            # sample boxes
            sampling_result = self.bbox_sampler.sample(assign_result,
                                                       cur_boxes,
                                                       cur_gt_bboxes,
                                                       cur_gt_labels)
            sampling_results.append(sampling_result)
        return sampling_results

    def _semantic_forward_train(self, x, voxels_dict, gt_bboxes_3d,
                                gt_labels_3d):
        """Run the semantic head, compute its targets and loss."""
        semantic_results = self.semantic_head(x)
        semantic_targets = self.semantic_head.get_targets(
            voxels_dict, gt_bboxes_3d, gt_labels_3d)
        loss_semantic = self.semantic_head.loss(semantic_results,
                                                semantic_targets)
        semantic_results.update(loss_semantic=loss_semantic)
        return semantic_results
tests/test_config.py
View file @
885a225b
...
...
@@ -61,6 +61,9 @@ def test_config_build_detector():
assert
detector
.
roi_head
.
with_mask
==
detector
.
with_mask
head_config
=
config_mod
.
model
[
'roi_head'
]
if
head_config
.
type
==
'PartAggregationROIHead'
:
check_parta2_roi_head
(
head_config
,
detector
.
roi_head
)
else
:
_check_roi_head
(
head_config
,
detector
.
roi_head
)
# else:
# # for single stage detector
...
...
@@ -319,3 +322,44 @@ def _check_bbox_head(bbox_cfg, bbox_head):
out_dim
=
(
4
if
bbox_cfg
.
reg_class_agnostic
else
4
*
bbox_cfg
.
num_classes
)
assert
bbox_head
.
fc_reg
.
out_features
==
out_dim
def check_parta2_roi_head(config, head):
    """Check that a PartAggregationROIHead instance matches its config."""
    assert config['type'] == type(head).__name__

    # both RoI-aware feature extractors share one checker
    _check_parta2_roi_extractor(config.seg_roi_extractor,
                                head.seg_roi_extractor)
    _check_parta2_roi_extractor(config.part_roi_extractor,
                                head.part_roi_extractor)

    # bbox head channel configuration
    _check_parta2_bbox_head(config.bbox_head, head.bbox_head)
def
_check_parta2_roi_extractor
(
config
,
roi_extractor
):
assert
config
[
'type'
]
==
roi_extractor
.
__class__
.
__name__
assert
(
config
.
roi_layer
.
out_size
==
roi_extractor
.
roi_layer
.
out_size
)
assert
(
config
.
roi_layer
.
max_pts_per_voxel
==
roi_extractor
.
roi_layer
.
max_pts_per_voxel
)
def
_check_parta2_bbox_head
(
bbox_cfg
,
bbox_head
):
import
torch.nn
as
nn
if
isinstance
(
bbox_cfg
,
list
):
for
single_bbox_cfg
,
single_bbox_head
in
zip
(
bbox_cfg
,
bbox_head
):
_check_bbox_head
(
single_bbox_cfg
,
single_bbox_head
)
elif
isinstance
(
bbox_head
,
nn
.
ModuleList
):
for
single_bbox_head
in
bbox_head
:
_check_bbox_head
(
bbox_cfg
,
single_bbox_head
)
else
:
assert
bbox_cfg
[
'type'
]
==
bbox_head
.
__class__
.
__name__
assert
bbox_cfg
.
seg_in_channels
==
bbox_head
.
seg_conv
[
0
][
0
].
in_channels
assert
bbox_cfg
.
part_in_channels
==
bbox_head
.
part_conv
[
0
][
0
].
in_channels
tests/test_heads.py
View file @
885a225b
...
...
@@ -103,18 +103,18 @@ def test_second_head_loss():
losses
=
self
.
loss
(
cls_score
,
bbox_pred
,
dir_cls_preds
,
gt_bboxes
,
gt_labels
,
input_metas
)
assert
losses
[
'loss_cls
_3d
'
][
0
]
>
0
assert
losses
[
'loss_bbox
_3d
'
][
0
]
>
0
assert
losses
[
'loss_dir
_3d
'
][
0
]
>
0
assert
losses
[
'loss_
rpn_
cls'
][
0
]
>
0
assert
losses
[
'loss_
rpn_
bbox'
][
0
]
>
0
assert
losses
[
'loss_
rpn_
dir'
][
0
]
>
0
# test empty ground truth case
gt_bboxes
=
list
(
torch
.
empty
((
2
,
0
,
7
)).
cuda
())
gt_labels
=
list
(
torch
.
empty
((
2
,
0
)).
cuda
())
empty_gt_losses
=
self
.
loss
(
cls_score
,
bbox_pred
,
dir_cls_preds
,
gt_bboxes
,
gt_labels
,
input_metas
)
assert
empty_gt_losses
[
'loss_cls
_3d
'
][
0
]
>
0
assert
empty_gt_losses
[
'loss_bbox
_3d
'
][
0
]
==
0
assert
empty_gt_losses
[
'loss_dir
_3d
'
][
0
]
==
0
assert
empty_gt_losses
[
'loss_
rpn_
cls'
][
0
]
>
0
assert
empty_gt_losses
[
'loss_
rpn_
bbox'
][
0
]
==
0
assert
empty_gt_losses
[
'loss_
rpn_
dir'
][
0
]
==
0
def
test_second_head_getboxes
():
...
...
@@ -147,7 +147,7 @@ def test_parta2_rpnhead_getboxes():
if
not
torch
.
cuda
.
is_available
():
pytest
.
skip
(
'test requires GPU and torch+cuda'
)
rpn_head_cfg
,
proposal_cfg
=
_get_rpn_head_cfg
(
'kitti/hv_PartA2_secfpn_4x8_c
osine
_80e_kitti-3d-3class.py'
)
'kitti/hv_PartA2_secfpn_4x8_c
yclic
_80e_kitti-3d-3class.py'
)
from
mmdet3d.models.builder
import
build_head
self
=
build_head
(
rpn_head_cfg
)
...
...
tests/test_semantic_heads.py
View file @
885a225b
...
...
@@ -62,9 +62,7 @@ def test_PointwiseSemanticHead():
[
voxel_features
.
shape
[
0
],
3
])
# test loss
loss_dict
=
self
.
loss
(
feats_dict
[
'seg_preds'
],
feats_dict
[
'part_preds'
],
target_dict
[
'seg_targets'
],
target_dict
[
'part_targets'
])
loss_dict
=
self
.
loss
(
feats_dict
,
target_dict
)
assert
loss_dict
[
'loss_seg'
]
>
0
assert
loss_dict
[
'loss_part'
]
==
0
# no points in gt_boxes
total_loss
=
loss_dict
[
'loss_seg'
]
+
loss_dict
[
'loss_part'
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment