"vscode:/vscode.git/clone" did not exist on "ecde075d49286f039deb712439c694b01e38bb6a"
Unverified commit f98bc952, authored by twang and committed by GitHub

[Enhance] SECOND benchmark on Waymo (#166)

* Add SECOND config for Waymo

* Update README.md

* Update README.md

* Fix a minor typo in the KITTI log link
# model settings
# Voxel size for the voxel encoder.
# The voxel size is usually chosen consistently with the point cloud range:
# if the point cloud range is modified, remember to update all related
# keys in the config.
voxel_size = [0.08, 0.08, 0.1]
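# The sparse_shape of the middle encoder below follows from the point cloud
# range and the voxel size (axis order [z, y, x]):
#   z: (4 - (-2)) / 0.1  = 60 grids (+1 extra slice, following the convention
#      used in the other SECOND configs) -> 61
#   y: (51.2 - (-51.2)) / 0.08 = 1280 grids
#   x: (76.8 - (-76.8)) / 0.08 = 1920 grids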
model = dict(
type='VoxelNet',
voxel_layer=dict(
max_num_points=10,
point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4],
voxel_size=voxel_size,
max_voxels=(60000, 60000)),
voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
middle_encoder=dict(
type='SparseEncoder',
in_channels=5,
sparse_shape=[61, 1280, 1920],
order=('conv', 'norm', 'act')),
backbone=dict(
type='SECOND',
in_channels=384,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
layer_nums=[5, 5],
layer_strides=[1, 2],
out_channels=[128, 256]),
neck=dict(
type='SECONDFPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[128, 256],
upsample_strides=[1, 2],
out_channels=[256, 256]),
bbox_head=dict(
type='Anchor3DHead',
num_classes=3,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345],
[-76.8, -51.2, 0, 76.8, 51.2, 0],
[-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]],
sizes=[
[2.08, 4.73, 1.77], # car
[0.84, 0.91, 1.74], # pedestrian
[0.84, 1.81, 1.77] # cyclist
],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)))
# model training and testing settings
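# One MaxIoUAssigner is configured per class; the list order matches the
# anchor order above (car, pedestrian, cyclist).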
train_cfg = dict(
assigner=[
dict( # car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
dict( # cyclist
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1)
],
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
pos_weight=-1,
debug=False)
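# Test-time settings: rotated NMS in bird's-eye view (nms_thr), keeping at
# most `max_num` boxes with scores above `score_thr`.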
test_cfg = dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=4096,
nms_thr=0.25,
score_thr=0.1,
min_bbox_size=0,
max_num=500)
We implement SECOND and provide the results and checkpoints on the KITTI dataset.
| Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP |Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
| [SECFPN](./hv_second_secfpn_6x8_80e_kitti-3d-car.py)| Car |cyclic 80e|5.4||79.07|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-car/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238-393f000c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-car/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238.log.json)|
| [SECFPN](./hv_second_secfpn_6x8_80e_kitti-3d-3class.py)| 3 Class |cyclic 80e|5.4||64.41|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-3class/hv_second_secfpn_6x8_80e_kitti-3d-3class_20200620_230238-9208083a.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-3class/hv_second_secfpn_6x8_80e_kitti-3d-3class_20200620_230238.log.json)|
### Waymo
| Backbone | Load Interval | Class | Lr schd | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download |
| :-------: | :-----------: |:-----:| :------:| :------: | :------------: | :----: | :-----: | :-----: | :-----: | :------: |
| [SECFPN](./hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py)|5|3 Class|2x|8.12||58.0|53.5|51.5|48.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class/hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class_20201003_105141-4cc0bb58.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class/hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class_20201003_105141.log.json)|
| above @ Car|||2x|8.12||58.5|57.9|51.6|51.1| |
| above @ Pedestrian|||2x|8.12||63.9|54.9|56.0|48.0| |
| above @ Cyclist|||2x|8.12||48.6|47.6|46.8|45.8| |
Note: See more details about metrics and the data split on Waymo [HERE](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/pointpillars). For implementation details, we basically follow the original settings. All of these results are achieved without bells and whistles, e.g., ensembling, multi-scale training, or test-time augmentation.
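As a quick sanity check after preparing the Waymo data in `kitti_format`, the config below can be loaded and its dataset and model built directly from Python. This is only a minimal sketch assuming the v0.x-style mmdetection3d APIs (`mmcv.Config`, `mmdet3d.datasets.build_dataset`, `mmdet3d.models.build_detector`) and the default config path; adjust paths and imports to your installation.

```python
from mmcv import Config
from mmdet3d.datasets import build_dataset
from mmdet3d.models import build_detector

# Load the Waymo 3-class SECOND config (shown below).
cfg = Config.fromfile(
    'configs/second/hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py')

# Build the training dataset; this needs the converted Waymo infos and
# point clouds under data/waymo/kitti_format/.
dataset = build_dataset(cfg.data.train)
print(type(dataset).__name__, len(dataset))

# Build the detector with the train/test settings from the base model config.
model = build_detector(
    cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg'))
print(type(model).__name__)  # VoxelNet
```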
_base_ = [
'../_base_/models/hv_second_secfpn_waymo.py',
'../_base_/datasets/waymoD5-3d-3class.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
class_names = ['Car', 'Pedestrian', 'Cyclist']
point_cloud_range = [-76.8, -51.2, -2, 76.8, 51.2, 4]
input_modality = dict(use_lidar=True, use_camera=False)
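# Ground-truth database sampler: pastes sampled GT objects from
# waymo_dbinfos_train.pkl into each training scene (consumed by the
# ObjectSample transform in the train pipeline below).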
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'waymo_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
classes=class_names,
sample_groups=dict(Car=6, Pedestrian=8, Cyclist=10),
points_loader=dict(
type='LoadPointsFromFile', load_dim=5, use_dim=[0, 1, 2, 3, 4]))
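# Raw Waymo points (kitti_format) are stored with 6 dims per point; only the
# first 5 (x, y, z, intensity, elongation) are used, matching num_features=5
# in the voxel encoder. The GT-database points above are saved with 5 dims.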
train_pipeline = [
dict(type='LoadPointsFromFile', load_dim=6, use_dim=5),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadPointsFromFile', load_dim=6, use_dim=5),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'waymo_infos_train.pkl',
split='training',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False,
# we use box_type_3d='LiDAR' in the KITTI and nuScenes datasets
# and box_type_3d='Depth' in the SUN RGB-D and ScanNet datasets.
box_type_3d='LiDAR',
# load one frame every five frames
load_interval=5)),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'waymo_infos_val.pkl',
split='training',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'waymo_infos_val.pkl',
split='training',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True,
box_type_3d='LiDAR'))