Commit ede10946 authored by Qing Lian, committed by ZwwWayne

[Fix] Fix some loading bugs and support fov_image_based mode in Waymo dataset. (#1942)



* modify sample_id to sample_idx and support fov_image_based mode on the Waymo dataset

* Update waymo_metric.py

* Minor fix

* Minor fix

* Minor fix

* Minor fix

* Minor fix

* Minor fix

* Minor fixes

* Minor fixes

* Remove optional

* fix dataset instances converting bugs

* Add a blank line to fix the doc compilation format

* Fix the bin file name in waymo_fov config

* Resolve conflicts

* fix ci and other things
Co-authored-by: Tai-Wang <tab_wang@outlook.com>
Co-authored-by: lianqing11 <lianqing11@foxmail.com>
Co-authored-by: ChaimZhu <zhuchenming@pjlab.org.cn>
parent 17ac0691
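The central change in this commit is renaming the `task` argument of the datasets and the Waymo metric to `load_type`, with a new third mode `fov_image_based`. A minimal sketch of the new option in a dataset config (values mirror the Waymo config below; this snippet itself is illustrative, not part of the diff):

train_dataset = dict(
    type='WaymoDataset',
    data_root='data/waymo/kitti_format/',
    ann_file='waymo_infos_train.pkl',
    # one of 'frame_based', 'mv_image_based', 'fov_image_based'
    load_type='fov_image_based',
    box_type_3d='Camera')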
@@ -52,6 +52,7 @@ train_dataloader = dict(
data_prefix=dict(img='training/image_2'),
pipeline=train_pipeline,
modality=input_modality,
load_type='fov_image_based',
test_mode=False,
metainfo=metainfo,
# we use box_type_3d='Camera' in monocular 3d
@@ -70,6 +71,7 @@ val_dataloader = dict(
ann_file='kitti_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
load_type='fov_image_based',
metainfo=metainfo,
test_mode=True,
box_type_3d='Camera'))
@@ -65,7 +65,7 @@ train_dataloader = dict(
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
ann_file='nuscenes_infos_train.pkl',
task='mono_det',
load_type='mv_image_based',
pipeline=train_pipeline,
metainfo=metainfo,
modality=input_modality,
@@ -92,7 +92,7 @@ val_dataloader = dict(
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
ann_file='nuscenes_infos_val.pkl',
task='mono_det',
load_type='mv_image_based',
pipeline=test_pipeline,
modality=input_modality,
metainfo=metainfo,
# dataset settings
# D5 in the config name means the whole dataset is divided into 5 folds
# We only use one fold for efficient experiments
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
class_names = ['Car', 'Pedestrian', 'Cyclist']
input_modality = dict(use_lidar=False, use_camera=True)
file_client_args = dict(backend='disk')
# Uncomment the following if using Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=False,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
# base shape (1248, 832), scale (0.95, 1.05)
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(0.95, 1.05),
keep_ratio=True,
),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(1., 1.),
keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img']),
]
# construct a pipeline for data and gt loading in the show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(1., 1.),
keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img']),
]
metainfo = dict(CLASSES=class_names)
train_dataloader = dict(
batch_size=3,
num_workers=3,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_train.pkl',
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
load_type='fov_image_based',
# load one frame every five frames
load_interval=5))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
ann_file='waymo_infos_val.pkl',
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
load_type='fov_image_based',
))
test_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
ann_file='waymo_infos_val.pkl',
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
load_type='fov_image_based',
))
val_evaluator = dict(
type='WaymoMetric',
ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
waymo_bin_file='./data/waymo/waymo_format/fov_gt.bin',
data_root='./data/waymo/waymo_format',
metric='LET_mAP',
load_type='fov_image_based',
)
test_evaluator = val_evaluator
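For reference, a sketch of how this new base config could be consumed, assuming a standard mmengine/mmdet3d dev-1.x setup and prepared Waymo data; the path is the base file referenced by the PGD config later in this commit:

from mmengine.config import Config
from mmdet3d.registry import DATASETS
from mmdet3d.utils import register_all_modules

register_all_modules()  # register mmdet3d datasets, transforms, etc.
cfg = Config.fromfile(
    'configs/_base_/datasets/waymoD5-fov-mono3d-3class.py')
# builds a WaymoDataset with load_type='fov_image_based'
dataset = DATASETS.build(cfg.train_dataloader.dataset)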
@@ -81,7 +81,7 @@ train_dataloader = dict(
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
task='mono_det',
load_type='mv_image_based',
# load one frame every five frames
load_interval=5))
@@ -109,7 +109,7 @@ val_dataloader = dict(
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
task='mono_det',
load_type='mv_image_based',
))
test_dataloader = dict(
@@ -136,7 +136,7 @@ test_dataloader = dict(
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
task='mono_det',
load_type='mv_image_based',
))
val_evaluator = dict(
@@ -145,5 +145,6 @@ val_evaluator = dict(
waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
data_root='./data/waymo/waymo_format',
metric='LET_mAP',
task='mono_det')
load_type='mv_image_based',
)
test_evaluator = val_evaluator
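The FOV config above evaluates against fov_gt.bin, while this multi-view config uses cam_gt.bin. A hypothetical helper summarizing which ground-truth bin file pairs with which load_type (the first two file names are taken from these configs; gt.bin for frame_based is an assumption):

def waymo_gt_bin(load_type: str) -> str:
    # map each load_type to its Waymo ground-truth bin file
    return {
        'frame_based': './data/waymo/waymo_format/gt.bin',  # assumed
        'mv_image_based': './data/waymo/waymo_format/cam_gt.bin',
        'fov_image_based': './data/waymo/waymo_format/fov_gt.bin',
    }[load_type]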
_base_ = [
'../_base_/datasets/waymoD5-fov-mono3d-3class.py',
'../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
type='mmdet.ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True)),
neck=dict(num_outs=3),
bbox_head=dict(
num_classes=3,
bbox_code_size=7,
pred_attrs=False,
pred_velo=False,
pred_bbox2d=True,
use_onlyreg_proj=True,
strides=(8, 16, 32),
regress_ranges=((-1, 128), (128, 256), (256, 1e8)),
group_reg_dims=(2, 1, 3, 1, 16,
4), # offset, depth, size, rot, kpts, bbox2d
reg_branch=(
(256, ), # offset
(256, ), # depth
(256, ), # size
(256, ), # rot
(256, ), # kpts
(256, ) # bbox2d
),
centerness_branch=(256, ),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
use_depth_classifier=True,
depth_branch=(256, ),
depth_range=(0, 50),
depth_unit=10,
division='uniform',
depth_bins=6,
pred_keypoints=True,
weight_dim=1,
loss_depth=dict(
type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0,
loss_weight=1.0),
loss_bbox2d=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0),
loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0),
bbox_coder=dict(
type='PGDBBoxCoder',
base_depths=((41.01, 18.44), ),
base_dims=(
(4.73, 1.77, 2.08),
(0.91, 1.74, 0.84),
(1.81, 1.77, 0.84),
),
code_size=7)),
# set weight 1.0 for base 7 dims (offset, depth, size, rot)
# 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets
train_cfg=dict(code_weight=[
1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0
]),
test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
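A quick consistency check on the head configuration: the regression groups sum to the 27 entries of code_weight above.

# offset(2) + depth(1) + size(3) + rot(1) + kpts(16) + bbox2d(4) = 27
group_reg_dims = (2, 1, 3, 1, 16, 4)
code_weight_len = 27  # the list above has 7 + 16 + 4 = 27 entries
assert sum(group_reg_dims) == code_weight_len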
# optimizer
optim_wrapper = dict(
optimizer=dict(
type='SGD',
lr=0.008,
),
paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
clip_grad=dict(max_norm=35, norm_type=2))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0 / 3,
by_epoch=False,
begin=0,
end=500),
dict(
type='MultiStepLR',
begin=0,
end=24,
by_epoch=True,
milestones=[16, 22],
gamma=0.1)
]
total_epochs = 24
runner = dict(max_epochs=total_epochs)
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
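The schedule above is a 500-iteration linear warmup into a two-milestone step decay; a quick arithmetic check of the resulting learning rates, assuming the base lr of 0.008 from the optimizer:

base_lr = 0.008
print(base_lr / 3)     # ~0.00267 at iteration 0 (start_factor=1/3)
print(base_lr)         # 0.008 after warmup, until epoch 16
print(base_lr * 0.1)   # 0.0008 from epoch 16 to epoch 22
print(base_lr * 0.01)  # 8e-05 from epoch 22 to epoch 24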
_base_ = [
'../_base_/datasets/waymoD5-mv-mono3d-3class.py',
'../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
type='mmdet.ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True)),
neck=dict(num_outs=3),
bbox_head=dict(
num_classes=3,
bbox_code_size=7,
pred_attrs=False,
pred_velo=False,
pred_bbox2d=True,
use_onlyreg_proj=True,
strides=(8, 16, 32),
regress_ranges=((-1, 128), (128, 256), (256, 1e8)),
group_reg_dims=(2, 1, 3, 1, 16,
4), # offset, depth, size, rot, kpts, bbox2d
reg_branch=(
(256, ), # offset
(256, ), # depth
(256, ), # size
(256, ), # rot
(256, ), # kpts
(256, ) # bbox2d
),
centerness_branch=(256, ),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
use_depth_classifier=True,
depth_branch=(256, ),
depth_range=(0, 50),
depth_unit=10,
division='uniform',
depth_bins=6,
pred_keypoints=True,
weight_dim=1,
loss_depth=dict(
type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0,
loss_weight=1.0),
loss_bbox2d=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0),
loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0),
bbox_coder=dict(
type='PGDBBoxCoder',
base_depths=((41.01, 18.44), ),
base_dims=(
(4.73, 1.77, 2.08),
(0.91, 1.74, 0.84),
(1.81, 1.77, 0.84),
),
code_size=7)),
# set weight 1.0 for base 7 dims (offset, depth, size, rot)
# 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets
train_cfg=dict(code_weight=[
1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0
]),
test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
# optimizer
optim_wrapper = dict(
optimizer=dict(
type='SGD',
lr=0.008,
),
paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
clip_grad=dict(max_norm=35, norm_type=2))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0 / 3,
by_epoch=False,
begin=0,
end=500),
dict(
type='MultiStepLR',
begin=0,
end=24,
by_epoch=True,
milestones=[16, 22],
gamma=0.1)
]
total_epochs = 24
runner = dict(max_epochs=total_epochs)
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
@@ -32,6 +32,15 @@ class KittiDataset(Det3DDataset):
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
load_type (str): Type of loading mode. Defaults to 'frame_based'.
- 'frame_based': Load all of the instances in the frame.
- 'mv_image_based': Load all of the instances in the frame and convert
them to the FOV-based data format to support image-based detectors.
- 'fov_image_based': Only load the instances inside the default cam
and convert them to the FOV-based data format to support image-based
detectors.
filter_empty_gt (bool): Whether to filter the data with empty GT.
If it's set to True, examples with empty annotations after the
data pipeline will be dropped and a random example will be chosen
@@ -54,7 +63,7 @@ class KittiDataset(Det3DDataset):
pipeline: List[Union[dict, Callable]] = [],
modality: dict = dict(use_lidar=True),
default_cam_key: str = 'CAM2',
task: str = 'lidar_det',
load_type: str = 'frame_based',
box_type_3d: str = 'LiDAR',
filter_empty_gt: bool = True,
test_mode: bool = False,
@@ -62,8 +71,9 @@ class KittiDataset(Det3DDataset):
**kwargs) -> None:
self.pcd_limit_range = pcd_limit_range
assert task in ('lidar_det', 'mono_det')
self.task = task
assert load_type in ('frame_based', 'mv_image_based',
'fov_image_based')
self.load_type = load_type
super().__init__(
data_root=data_root,
ann_file=ann_file,
@@ -113,7 +123,7 @@ class KittiDataset(Det3DDataset):
info['plane'] = plane_lidar
if self.task == 'mono_det' and self.load_eval_anns:
if self.load_type == 'fov_image_based' and self.load_eval_anns:
info['instances'] = info['cam_instances'][self.default_cam_key]
info = super().parse_data_info(info)
@@ -144,7 +154,7 @@ class KittiDataset(Det3DDataset):
ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
if self.task == 'mono_det':
if self.load_type in ['fov_image_based', 'mv_image_based']:
ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
ann_info['gt_bboxes_labels'] = np.zeros(0, dtype=np.int64)
ann_info['centers_2d'] = np.zeros((0, 2), dtype=np.float32)
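A brief illustration of the shape contract behind these placeholders (and behind the np.zeros fix above): an image with no instances needs length-0 arrays, not 0-d scalars, so that downstream len() and concatenation work:

import numpy as np

empty_labels = np.zeros(0, dtype=np.int64)  # shape (0,): len() == 0
scalar_label = np.array(0, dtype=np.int64)  # shape (): 0-d, len() raises TypeError
assert empty_labels.shape == (0,)
assert scalar_label.shape == ()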
@@ -22,7 +22,6 @@ class NuScenesDataset(Det3DDataset):
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
task (str): Detection task. Defaults to 'lidar_det'.
pipeline (list[dict]): Pipeline used for data processing.
Defaults to [].
box_type_3d (str): Type of 3D box of this dataset.
@@ -33,6 +32,15 @@ class NuScenesDataset(Det3DDataset):
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
load_type (str): Type of loading mode. Defaults to 'frame_based'.
- 'frame_based': Load all of the instances in the frame.
- 'mv_image_based': Load all of the instances in the frame and convert
them to the FOV-based data format to support image-based detectors.
- 'fov_image_based': Only load the instances inside the default cam
and convert them to the FOV-based data format to support image-based
detectors.
modality (dict): Modality to specify the sensor data used as input.
Defaults to dict(use_camera=False, use_lidar=True).
filter_empty_gt (bool): Whether to filter the data with empty GT.
@@ -58,9 +66,9 @@ class NuScenesDataset(Det3DDataset):
def __init__(self,
data_root: str,
ann_file: str,
task: str = 'lidar_det',
pipeline: List[Union[dict, Callable]] = [],
box_type_3d: str = 'LiDAR',
load_type: str = 'frame_based',
modality: dict = dict(
use_camera=False,
use_lidar=True,
@@ -74,8 +82,9 @@ class NuScenesDataset(Det3DDataset):
self.with_velocity = with_velocity
# TODO: Redesign multi-view data process in the future
assert task in ('lidar_det', 'mono_det', 'multi-view_det')
self.task = task
assert load_type in ('frame_based', 'mv_image_based',
'fov_image_based')
self.load_type = load_type
assert box_type_3d.lower() in ('lidar', 'camera')
super().__init__(
@@ -144,7 +153,7 @@ class NuScenesDataset(Det3DDataset):
ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
if self.task == 'mono3d':
if self.load_type in ['fov_image_based', 'mv_image_based']:
ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
ann_info['gt_bboxes_labels'] = np.zeros(0, dtype=np.int64)
ann_info['attr_labels'] = np.zeros(0, dtype=np.int64)
@@ -154,7 +163,7 @@ class NuScenesDataset(Det3DDataset):
# the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
# the same as KITTI (0.5, 0.5, 0)
# TODO: Unify the coordinates
if self.task == 'mono_det':
if self.load_type in ['fov_image_based', 'mv_image_based']:
gt_bboxes_3d = CameraInstance3DBoxes(
ann_info['gt_bboxes_3d'],
box_dim=ann_info['gt_bboxes_3d'].shape[-1],
@@ -182,7 +191,7 @@ class NuScenesDataset(Det3DDataset):
dict: Has `ann_info` in training stage. And all paths have
been converted to absolute paths.
"""
if self.task == 'mono_det':
if self.load_type == 'mv_image_based':
data_list = []
if self.modality['use_lidar']:
info['lidar_points']['lidar_path'] = \
@@ -45,6 +45,15 @@ class WaymoDataset(KittiDataset):
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
load_type (str): Type of loading mode. Defaults to 'frame_based'.
- 'frame_based': Load all of the instances in the frame.
- 'mv_image_based': Load all of the instances in the frame and convert
them to the FOV-based data format to support image-based detectors.
- 'fov_image_based': Only load the instances inside the default cam
and convert them to the FOV-based data format to support image-based
detectors.
filter_empty_gt (bool): Whether to filter the data with empty GT.
If it's set to True, examples with empty annotations after the
data pipeline will be dropped and a random example will be chosen
@@ -57,10 +66,6 @@ class WaymoDataset(KittiDataset):
cam_sync_instances (bool): Whether to use the camera-synced labels
supported from Waymo version 1.3.1. Defaults to False.
load_interval (int): Interval of frame loading. Defaults to 1.
task (str): task for 3D detection (lidar, mono3d).
lidar: take all the ground trurh in the frame.
mono3d: take the groundtruth that can be seen in the cam.
Defaults to 'lidar_det'.
max_sweeps (int): max sweep for each frame. Defaults to 0.
"""
METAINFO = {'classes': ('Car', 'Pedestrian', 'Cyclist')}
@@ -79,12 +84,12 @@ class WaymoDataset(KittiDataset):
modality: dict = dict(use_lidar=True),
default_cam_key: str = 'CAM_FRONT',
box_type_3d: str = 'LiDAR',
load_type: str = 'frame_based',
filter_empty_gt: bool = True,
test_mode: bool = False,
pcd_limit_range: List[float] = [0, -40, -3, 70.4, 40, 0.0],
cam_sync_instances: bool = False,
load_interval: int = 1,
task: str = 'lidar_det',
max_sweeps: int = 0,
**kwargs) -> None:
self.load_interval = load_interval
@@ -108,7 +113,7 @@ class WaymoDataset(KittiDataset):
default_cam_key=default_cam_key,
data_prefix=data_prefix,
test_mode=test_mode,
task=task,
load_type=load_type,
**kwargs)
def parse_ann_info(self, info: dict) -> dict:
@@ -151,7 +156,7 @@ class WaymoDataset(KittiDataset):
centers_2d = np.zeros((0, 2), dtype=np.float32)
depths = np.zeros((0), dtype=np.float32)
if self.task == 'mono_det':
if self.load_type in ['fov_image_based', 'mv_image_based']:
gt_bboxes_3d = CameraInstance3DBoxes(
ann_info['gt_bboxes_3d'],
box_dim=ann_info['gt_bboxes_3d'].shape[-1],
@@ -186,10 +191,19 @@ class WaymoDataset(KittiDataset):
def parse_data_info(self, info: dict) -> dict:
"""if task is lidar or multiview det, use super() method elif task is
mono3d, split the info from frame-wise to img-wise."""
if self.task != 'mono_det':
if self.cam_sync_instances:
# use the cam sync labels
info['instances'] = info['cam_sync_instances']
if self.load_type == 'frame_based':
return super().parse_data_info(info)
elif self.load_type == 'fov_image_based':
# only loading the fov image and the fov instance
new_image_info = {}
new_image_info[self.default_cam_key] = \
info['images'][self.default_cam_key]
info['images'] = new_image_info
info['instances'] = info['cam_instances'][self.default_cam_key]
return super().parse_data_info(info)
else:
# in mv_image_based mode, the instances come from the cam sync labels.
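The truncated else branch above handles mv_image_based by splitting one frame-level info dict into one entry per camera. Roughly, as a simplified sketch rather than the verbatim implementation (lidar paths and eval annotations are elided):

def split_to_per_camera(info: dict) -> list:
    # one output entry per camera image in the frame
    data_list = []
    for cam_key, img_info in info['images'].items():
        camera_info = dict()
        camera_info['images'] = {cam_key: img_info}
        if 'cam_instances' in info and cam_key in info['cam_instances']:
            camera_info['instances'] = info['cam_instances'][cam_key]
        camera_info['sample_idx'] = info['sample_idx']
        data_list.append(camera_info)
    return data_list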
@@ -222,7 +236,7 @@ class WaymoDataset(KittiDataset):
# TODO: check whether the sample idx needs to be modified
# TODO: check when it is used other than in evaluation.
camera_info['sample_id'] = info['sample_id']
camera_info['sample_idx'] = info['sample_idx']
if not self.test_mode:
# used in training
@@ -45,8 +45,15 @@ class WaymoMetric(KittiMetric):
submission_prefix (str, optional): The prefix of submission data.
If not specified, the submission data will not be generated.
Default: None.
task: (str, optional): task for 3D detection, if cam, would filter
the points that outside the image.
load_type (str, optional): Type of loading mode during training.
- 'frame_based': Load all of the instances in the frame.
- 'mv_image_based': Load all of the instances in the frame and convert
them to the FOV-based data format to support image-based detectors.
- 'fov_image_based': Only load the instances inside the default cam
and convert them to the FOV-based data format to support image-based
detectors.
default_cam_key (str, optional): The default camera for lidar to
camera conversion. By default, KITTI: CAM2, Waymo: CAM_FRONT.
use_pred_sample_idx (bool, optional): In formatting results, use the
@@ -76,7 +83,7 @@ class WaymoMetric(KittiMetric):
prefix: Optional[str] = None,
pklfile_prefix: str = None,
submission_prefix: str = None,
task='lidar_det',
load_type: str = 'frame_based',
default_cam_key: str = 'CAM_FRONT',
use_pred_sample_idx: bool = False,
collect_device: str = 'cpu',
@@ -85,7 +92,7 @@ class WaymoMetric(KittiMetric):
self.waymo_bin_file = waymo_bin_file
self.data_root = data_root
self.split = split
self.task = task
self.load_type = load_type
self.use_pred_sample_idx = use_pred_sample_idx
self.convert_kitti_format = convert_kitti_format
@@ -124,8 +131,8 @@ class WaymoMetric(KittiMetric):
assert len(results) == len(self.data_infos), \
'invalid list length of network outputs'
# different from KITTI, Waymo does not need to convert the ann file
# handle the mono3d task
if self.task == 'mono_det':
# handle the mv_image_based load_type
if self.load_type == 'mv_image_based':
new_data_infos = []
for info in self.data_infos:
height = info['images'][self.default_cam_key]['height']
@@ -425,7 +432,7 @@ class WaymoMetric(KittiMetric):
lidar2cam = cam0_info['images'][self.default_cam_key]['lidar2img']
lidar2cam = np.array(lidar2cam).astype(np.float32)
box_preds_camera = box_preds_lidar.convert_to(
Box3DMode.CAM, np.linalg.inv(lidar2cam), correct_yaw=True)
Box3DMode.CAM, lidar2cam, correct_yaw=True)
# Note: bbox is meaningless in final evaluation, set to 0
merged_box_dict = dict(
bbox=np.zeros([box_preds_lidar.tensor.shape[0], 4]),
@@ -470,7 +477,7 @@ class WaymoMetric(KittiMetric):
sample_idx = sample_id_list[idx]
info = self.data_infos[sample_idx]
if self.task == 'mono_det':
if self.load_type == 'mv_image_based':
if idx % self.num_cams == 0:
box_dict_per_frame = []
cam0_key = list(info['images'].keys())[0]
@@ -487,7 +494,7 @@ class WaymoMetric(KittiMetric):
# If you want to use another camera, please modify it.
image_shape = (info['images'][self.default_cam_key]['height'],
info['images'][self.default_cam_key]['width'])
if self.task == 'mono_det':
if self.load_type == 'mv_image_based':
box_dict_per_frame.append(box_dict)
if (idx + 1) % self.num_cams != 0:
continue
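The modulo bookkeeping above regroups per-camera predictions into frames before evaluation; schematically, as a standalone sketch assuming five cameras per Waymo frame and stand-in data:

num_cams = 5
per_camera_results = [dict(cam=i % num_cams) for i in range(10)]  # stand-in
frames = []
box_dict_per_frame = []
for idx, box_dict in enumerate(per_camera_results):
    if idx % num_cams == 0:
        box_dict_per_frame = []  # a new frame starts
    box_dict_per_frame.append(box_dict)
    if (idx + 1) % num_cams != 0:
        continue  # keep collecting until all five cameras have arrived
    frames.append(box_dict_per_frame)  # the metric merges these into one frame
assert len(frames) == 2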
@@ -587,7 +594,7 @@ class WaymoMetric(KittiMetric):
def convert_valid_bboxes(self, box_dict: dict, info: dict):
"""Convert the predicted boxes into valid ones. Should handle the
different task mode (mono3d, mv3d, lidar), separately.
load_type (frame_based, mv_image_based, fov_image_based), separately.
Args:
box_dict (dict): Box dictionaries to be converted.
@@ -624,11 +631,11 @@ class WaymoMetric(KittiMetric):
scores=np.zeros([0]),
label_preds=np.zeros([0, 4]),
sample_idx=sample_idx)
# Here default used 'CAM2' to compute metric. If you want to
# Here default used 'CAM_FRONT' to compute metric. If you want to
# use another camera, please modify it.
if self.task in ['mv3d_det', 'lidar_det']:
if self.load_type in ['frame_based', 'fov_image_based']:
cam_key = self.default_cam_key
elif self.task == 'mono_det':
elif self.load_type == 'mv_image_based':
cam_key = list(info['images'].keys())[0]
else:
raise NotImplementedError
@@ -661,12 +668,12 @@ class WaymoMetric(KittiMetric):
(box_2d_preds[:, 1] < image_shape[0]) &
(box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0))
# check box_preds_lidar
if self.task in ['mv3d_det', 'lidar_det']:
if self.load_type in ['frame_based']:
limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range)
valid_pcd_inds = ((box_preds_lidar.center > limit_range[:3]) &
(box_preds_lidar.center < limit_range[3:]))
valid_inds = valid_pcd_inds.all(-1)
elif self.task == 'mono_det':
if self.load_type in ['mv_image_based', 'fov_image_based']:
valid_inds = valid_cam_inds
if valid_inds.sum() > 0:
@@ -133,6 +133,7 @@ class Waymo2KITTI(object):
self.save_image(frame, file_idx, frame_idx)
self.save_calib(frame, file_idx, frame_idx)
if 'testing_3d_camera_only_detection' not in self.load_dir:
# the camera only split doesn't contain lidar points.
self.save_lidar(frame, file_idx, frame_idx)
self.save_pose(frame, file_idx, frame_idx)
self.save_timestamp(frame, file_idx, frame_idx)