"tests/vscode:/vscode.git/clone" did not exist on "3a063f5976781ed30c1c36b65fe837d63e76b94e"
Commit b496f579 authored by ZCMax, committed by ChaimZhu

[Refactor] Refactor Mono3D models

parent 35667791
dataset_type = 'KittiMonoDataset'
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
input_modality = dict(use_lidar=False, use_camera=True)
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
metainfo = dict(CLASSES=class_names)
# file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/',
'data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/'
}))
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox=True,
......@@ -14,79 +27,60 @@ train_pipeline = [
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1242, 375), keep_ratio=True),
dict(type='Resize', scale=(1242, 375), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d',
'centers2d', 'depths'
'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='MultiScaleFlipAug',
img_scale=(1242, 375),
flip=False,
transforms=[
dict(type='RandomFlip3D'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img'])
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(type='Resize', scale=(1242, 375), keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img'])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
train_dataloader = dict(
batch_size=2,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train_mono3d.coco.json',
info_file=data_root + 'kitti_infos_train.pkl',
img_prefix=data_root,
classes=class_names,
ann_file='kitti_infos_train.pkl',
data_prefix=dict(img='training/image_2'),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
box_type_3d='Camera'),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val_mono3d.coco.json',
info_file=data_root + 'kitti_infos_val.pkl',
img_prefix=data_root,
classes=class_names,
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
box_type_3d='Camera'),
test=dict(
metainfo=metainfo,
# we use box_type_3d='Camera' in monocular 3d
# detection task
box_type_3d='Camera'))
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val_mono3d.coco.json',
info_file=data_root + 'kitti_infos_val.pkl',
img_prefix=data_root,
classes=class_names,
data_prefix=dict(img='training/image_2'),
ann_file='kitti_infos_val.pkl',
pipeline=test_pipeline,
modality=input_modality,
metainfo=metainfo,
test_mode=True,
box_type_3d='Camera'))
evaluation = dict(interval=2)
test_dataloader = val_dataloader
val_evaluator = dict(
type='KittiMetric',
ann_file=data_root + 'kitti_infos_val.pkl',
metric='bbox',
pred_box_type_3d='Camera')
test_evaluator = val_evaluator
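Illustrative sketch, not part of the refactored configs themselves: the dataloader and evaluator settings above can be loaded and inspected with MMEngine's Config API. The file path below is a hypothetical example.

from mmengine.config import Config

# Load the KITTI mono3D dataset/evaluator config defined above
# (hypothetical path; adjust to where the config actually lives).
cfg = Config.fromfile('configs/_base_/datasets/kitti-mono3d.py')
print(cfg.train_dataloader.dataset.type)   # 'KittiDataset'
print(cfg.val_evaluator.type)              # 'KittiMetric'
print(cfg.val_evaluator.pred_box_type_3d)  # 'Camera'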
dataset_type = 'NuScenesMonoDataset'
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
metainfo = dict(CLASSES=class_names)
# Input modality for the nuScenes dataset; this is consistent with the submission
# format, which requires the information in input_modality.
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
input_modality = dict(use_lidar=False, use_camera=True)
# file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/nuscenes/':
's3://openmmlab/datasets/detection3d/nuscenes/',
'data/nuscenes/':
's3://openmmlab/datasets/detection3d/nuscenes/'
}))
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox=True,
......@@ -26,75 +34,77 @@ train_pipeline = [
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
'gt_labels_3d', 'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='MultiScaleFlipAug',
scale_factor=1.0,
flip=False,
transforms=[
dict(type='RandomFlip3D'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img'])
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(type='mmdet.Resize', scale=(1600, 900), keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img'])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
train_dataloader = dict(
batch_size=2,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json',
img_prefix=data_root,
classes=class_names,
data_prefix=dict(
pts='',
CAM_FRONT='samples/CAM_FRONT',
CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
CAM_BACK='samples/CAM_BACK',
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
ann_file='nuscenes_infos_train.pkl',
task='mono3d',
pipeline=train_pipeline,
metainfo=metainfo,
modality=input_modality,
test_mode=False,
box_type_3d='Camera'),
val=dict(
# we use box_type_3d='Camera' in monocular 3d
# detection task
box_type_3d='Camera',
use_valid_flag=True))
val_dataloader = dict(
batch_size=1,
num_workers=2,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
img_prefix=data_root,
classes=class_names,
data_prefix=dict(
pts='',
CAM_FRONT='samples/CAM_FRONT',
CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
CAM_BACK='samples/CAM_BACK',
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
ann_file='nuscenes_infos_val.pkl',
task='mono3d',
pipeline=test_pipeline,
modality=input_modality,
metainfo=metainfo,
test_mode=True,
box_type_3d='Camera'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
img_prefix=data_root,
classes=class_names,
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
box_type_3d='Camera'))
evaluation = dict(interval=2)
box_type_3d='Camera',
use_valid_flag=True))
test_dataloader = val_dataloader
val_evaluator = dict(
type='NuScenesMetric',
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
metric='bbox')
test_evaluator = val_evaluator
# model settings
model = dict(
type='FCOSMono3D',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32),
backbone=dict(
type='ResNet',
type='mmdet.ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
......@@ -13,7 +20,7 @@ model = dict(
type='Pretrained',
checkpoint='open-mmlab://detectron2/resnet101_caffe')),
neck=dict(
type='FPN',
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
start_level=1,
......@@ -45,18 +52,19 @@ model = dict(
dir_branch=(256, ),
attr_branch=(256, ),
loss_cls=dict(
type='FocalLoss',
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9),
norm_on_bbox=True,
centerness_on_reg=True,
......
......@@ -28,18 +28,19 @@ model = dict(
dir_branch=(256, ),
attr_branch=(256, ),
loss_cls=dict(
type='FocalLoss',
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
norm_on_bbox=True,
centerness_on_reg=True,
center_sampling=True,
......
# model settings
model = dict(
type='SMOKEMono3D',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32),
backbone=dict(
type='DLANet',
depth=34,
......@@ -42,10 +49,11 @@ model = dict(
base_dims=((0.88, 1.73, 0.67), (1.78, 1.70, 0.58), (3.88, 1.63,
1.53)),
code_size=7),
loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0),
loss_bbox=dict(type='L1Loss', reduction='sum', loss_weight=1 / 300),
loss_cls=dict(type='mmdet.GaussianFocalLoss', loss_weight=1.0),
loss_bbox=dict(
type='mmdet.L1Loss', reduction='sum', loss_weight=1 / 300),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=None,
conv_bias=True,
dcn_on_last_conv=False),
......
# training schedule for 1x
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# learning rate
param_scheduler = [
dict(
type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[8, 11])
runner = dict(type='EpochBasedRunner', max_epochs=12)
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
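Illustrative sketch, not part of the schedule config itself: the optim_wrapper plus param_scheduler pair above reproduces the old warmup-and-step lr_config, i.e. a linear warmup over the first 500 iterations followed by a 10x decay at epochs 8 and 11. A rough calculation of the effective learning rate, assuming 1000 iterations per epoch purely for illustration:

base_lr = 0.02  # from the SGD optimizer above

def lr_at(iteration, iters_per_epoch=1000):
    # LinearLR: factor ramps from start_factor (0.001) to 1.0 over 500 iterations
    if iteration < 500:
        factor = 0.001 + (1.0 - 0.001) * iteration / 500
    else:
        factor = 1.0
    # MultiStepLR: multiply by gamma (0.1) for every milestone epoch already passed
    epoch = iteration // iters_per_epoch
    factor *= 0.1 ** sum(epoch >= m for m in (8, 11))
    return base_lr * factor

print(lr_at(0), lr_at(500), lr_at(8000), lr_at(11000))
# ~2e-05    0.02        0.002       0.0002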
......@@ -4,18 +4,31 @@ _base_ = [
]
# model settings
model = dict(
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[103.530, 116.280, 123.675],
std=[1.0, 1.0, 1.0],
bgr_to_rgb=False,
pad_size_divisor=32),
backbone=dict(
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True)))
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
# file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/nuscenes/':
's3://openmmlab/datasets/detection3d/nuscenes/',
'data/nuscenes/':
's3://openmmlab/datasets/detection3d/nuscenes/'
}))
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox=True,
......@@ -24,52 +37,47 @@ train_pipeline = [
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='mmdet.Resize', scale=(1600, 900), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
'gt_labels_3d', 'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='MultiScaleFlipAug',
scale_factor=1.0,
flip=False,
transforms=[
dict(type='RandomFlip3D'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(type='mmdet.Resize', scale_factor=1.0),
dict(type='Pack3DDetInputs', keys=['img'])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
train_dataloader = dict(
batch_size=2,
num_workers=2,
dataset=dict(dataset=dict(pipeline=train_pipeline)))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# optimizer
optimizer = dict(
lr=0.002, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
optimizer_config = dict(
_delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
total_epochs = 12
evaluation = dict(interval=2)
optim_wrapper = dict(
optimizer=dict(lr=0.002),
paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
clip_grad=dict(max_norm=35, norm_type=2))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0 / 3,
by_epoch=False,
begin=0,
end=500),
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
......@@ -4,6 +4,12 @@ _base_ = [
]
# model settings
model = dict(
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[103.530, 116.280, 123.675],
std=[1.0, 1.0, 1.0],
bgr_to_rgb=False,
pad_size_divisor=32),
backbone=dict(frozen_stages=0),
neck=dict(start_level=0, num_outs=4),
bbox_head=dict(
......@@ -27,16 +33,17 @@ model = dict(
),
centerness_branch=(256, ),
loss_cls=dict(
type='FocalLoss',
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
use_depth_classifier=True,
depth_branch=(256, ),
depth_range=(0, 70),
......@@ -61,11 +68,21 @@ model = dict(
]),
test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
class_names = ['Pedestrian', 'Cyclist', 'Car']
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
# file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/',
'data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/'
}))
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox=True,
......@@ -74,54 +91,47 @@ train_pipeline = [
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1242, 375), keep_ratio=True),
dict(type='mmdet.Resize', scale=(1242, 375), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d',
'centers2d', 'depths'
'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='MultiScaleFlipAug',
scale_factor=1.0,
flip=False,
transforms=[
dict(type='RandomFlip3D'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(type='mmdet.Resize', scale_factor=1.0),
dict(type='Pack3DDetInputs', keys=['img'])
]
data = dict(
samples_per_gpu=3,
workers_per_gpu=3,
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
train_dataloader = dict(
batch_size=3, num_workers=3, dataset=dict(pipeline=train_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# optimizer
optimizer = dict(
lr=0.001, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
optimizer_config = dict(
_delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[32, 44])
total_epochs = 48
runner = dict(type='EpochBasedRunner', max_epochs=48)
evaluation = dict(interval=2)
checkpoint_config = dict(interval=8)
optim_wrapper = dict(
optimizer=dict(lr=0.01),
paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
clip_grad=dict(max_norm=35, norm_type=2))
# learning rate
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0 / 3,
by_epoch=False,
begin=0,
end=500),
dict(
type='MultiStepLR',
begin=0,
end=48,
by_epoch=True,
milestones=[32, 44],
gamma=0.1)
]
train_cfg = dict(max_epochs=48)
......@@ -3,21 +3,21 @@ _base_ = [
'../_base_/default_runtime.py'
]
# optimizer
optimizer = dict(type='Adam', lr=2.5e-4)
optimizer_config = dict(grad_clip=None)
lr_config = dict(policy='step', warmup=None, step=[50])
# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=72)
log_config = dict(interval=10)
# file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/',
'data/kitti/':
's3://openmmlab/datasets/detection3d/kitti/'
}))
find_unused_parameters = True
class_names = ['Pedestrian', 'Cyclist', 'Car']
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox=True,
......@@ -29,36 +29,42 @@ train_pipeline = [
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='RandomShiftScale', shift_scale=(0.2, 0.4), aug_prob=0.3),
dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d',
'centers2d', 'depths'
'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(type='LoadImageFromFileMono3D', file_client_args=file_client_args),
dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4),
dict(type='Pack3DDetInputs', keys=['img'])
]
train_dataloader = dict(
batch_size=8, num_workers=4, dataset=dict(pipeline=train_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# training schedule for 1x
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# learning rate
param_scheduler = [
dict(
type='MultiScaleFlipAug',
img_scale=(1280, 384),
flip=False,
transforms=[
dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=4,
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='Adam', lr=2.5e-4),
clip_grad=None)
# Copyright (c) OpenMMLab. All rights reserved.
from .builder import DATASETS, PIPELINES, build_dataset
from .convert_utils import get_2d_boxes
from .dataset_wrappers import CBGSDataset
from .det3d_dataset import Det3DDataset
from .kitti_dataset import KittiDataset
......@@ -41,5 +42,5 @@ __all__ = [
'LoadPointsFromMultiSweeps', 'WaymoDataset', 'BackgroundPointsFilter',
'VoxelBasedPointSampler', 'get_loading_pipeline', 'RandomDropPointsColor',
'RandomJitterPoints', 'ObjectNameFilter', 'AffineResize',
'RandomShiftScale', 'LoadPointsFromDict', 'PIPELINES'
'RandomShiftScale', 'LoadPointsFromDict', 'PIPELINES', 'get_2d_boxes'
]
# Copyright (c) OpenMMLab. All rights reserved.
from collections import OrderedDict
from typing import List, Tuple, Union
import numpy as np
from nuscenes.utils.geometry_utils import view_points
from pyquaternion import Quaternion
from shapely.geometry import MultiPoint, box
from mmdet3d.core.bbox import points_cam2img
nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
'pedestrian.moving', 'pedestrian.standing',
'pedestrian.sitting_lying_down', 'vehicle.moving',
'vehicle.parked', 'vehicle.stopped', 'None')
NameMapping = {
'movable_object.barrier': 'barrier',
'vehicle.bicycle': 'bicycle',
'vehicle.bus.bendy': 'bus',
'vehicle.bus.rigid': 'bus',
'vehicle.car': 'car',
'vehicle.construction': 'construction_vehicle',
'vehicle.motorcycle': 'motorcycle',
'human.pedestrian.adult': 'pedestrian',
'human.pedestrian.child': 'pedestrian',
'human.pedestrian.construction_worker': 'pedestrian',
'human.pedestrian.police_officer': 'pedestrian',
'movable_object.trafficcone': 'traffic_cone',
'vehicle.trailer': 'trailer',
'vehicle.truck': 'truck'
}
def get_2d_boxes(nusc, sample_data_token: str, visibilities: List[str]):
"""Get the 2D annotation records for a given `sample_data_token`.
Args:
nusc (:obj:`NuScenes`): NuScenes devkit instance.
sample_data_token (str): Sample data token belonging to a camera
keyframe.
visibilities (list[str]): Visibility filter.
Returns:
list[dict]: List of 2D annotation records that belong to the input
`sample_data_token`.
"""
# Get the sample data and the sample corresponding to that sample data.
sd_rec = nusc.get('sample_data', sample_data_token)
assert sd_rec[
'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \
' for camera sample_data!'
if not sd_rec['is_key_frame']:
raise ValueError(
'The 2D re-projections are available only for keyframes.')
s_rec = nusc.get('sample', sd_rec['sample_token'])
# Get the calibrated sensor and ego pose
# record to get the transformation matrices.
cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token'])
pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token'])
camera_intrinsic = np.array(cs_rec['camera_intrinsic'])
# Get all the annotations with the specified visibilities.
ann_recs = [
nusc.get('sample_annotation', token) for token in s_rec['anns']
]
ann_recs = [
ann_rec for ann_rec in ann_recs
if (ann_rec['visibility_token'] in visibilities)
]
repro_recs = []
for ann_rec in ann_recs:
# Augment sample_annotation with token information.
ann_rec['sample_annotation_token'] = ann_rec['token']
ann_rec['sample_data_token'] = sample_data_token
# Get the box in global coordinates.
box = nusc.get_box(ann_rec['token'])
# Move them to the ego-pose frame.
box.translate(-np.array(pose_rec['translation']))
box.rotate(Quaternion(pose_rec['rotation']).inverse)
# Move them to the calibrated sensor frame.
box.translate(-np.array(cs_rec['translation']))
box.rotate(Quaternion(cs_rec['rotation']).inverse)
# Filter out the corners that are not in front of the calibrated
# sensor.
corners_3d = box.corners()
in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
corners_3d = corners_3d[:, in_front]
# Project 3d box to 2d.
corner_coords = view_points(corners_3d, camera_intrinsic,
True).T[:, :2].tolist()
# Keep only corners that fall within the image.
final_coords = post_process_coords(corner_coords)
# Skip if the convex hull of the re-projected corners
# does not intersect the image canvas.
if final_coords is None:
continue
else:
min_x, min_y, max_x, max_y = final_coords
# Generate dictionary record to be included in the .json file.
repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
sample_data_token, sd_rec['filename'])
# if repro_rec is None, we do not append it to repro_recs
if repro_rec is not None:
loc = box.center.tolist()
dim = box.wlh
dim[[0, 1, 2]] = dim[[1, 2, 0]] # convert wlh to our lhw
dim = dim.tolist()
rot = box.orientation.yaw_pitch_roll[0]
rot = [-rot] # convert the rot to our cam coordinate
global_velo2d = nusc.box_velocity(box.token)[:2]
global_velo3d = np.array([*global_velo2d, 0.0])
e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
cam_velo3d = global_velo3d @ np.linalg.inv(
e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
velo = cam_velo3d[0::2].tolist()
repro_rec['bbox_3d'] = loc + dim + rot
repro_rec['velocity'] = velo
center_3d = np.array(loc).reshape([1, 3])
center_2d_with_depth = points_cam2img(
center_3d, camera_intrinsic, with_depth=True)
center_2d_with_depth = center_2d_with_depth.squeeze().tolist()
repro_rec['center_2d'] = center_2d_with_depth[:2]
repro_rec['depth'] = center_2d_with_depth[2]
# projected 2D center + depth
# samples with depth <= 0 are removed
if repro_rec['depth'] <= 0:
continue
ann_token = nusc.get('sample_annotation',
box.token)['attribute_tokens']
if len(ann_token) == 0:
attr_name = 'None'
else:
attr_name = nusc.get('attribute', ann_token[0])['name']
attr_id = nus_attributes.index(attr_name)
# repro_rec['attribute_name'] = attr_name
repro_rec['attr_label'] = attr_id
repro_recs.append(repro_rec)
return repro_recs
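Illustrative sketch of how get_2d_boxes might be called with the nuScenes devkit; the dataset version, data root and visibility filter below are assumptions made for the example only.

from nuscenes.nuscenes import NuScenes

nusc = NuScenes(version='v1.0-mini', dataroot='data/nuscenes', verbose=False)
# sample_data token of the front camera of the first sample (example only)
cam_token = nusc.sample[0]['data']['CAM_FRONT']
recs = get_2d_boxes(nusc, cam_token, visibilities=['2', '3', '4'])
print(len(recs))  # number of visible annotations that survive projection
if recs:
    print(recs[0]['bbox'], recs[0]['bbox_label'], recs[0]['depth'])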
def post_process_coords(
corner_coords: List, imsize: Tuple[int, int] = (1600, 900)
) -> Union[Tuple[float, float, float, float], None]:
"""Get the intersection of the convex hull of the reprojected bbox corners
and the image canvas, return None if no intersection.
Args:
corner_coords (list[list[float]]): Corner coordinates of the reprojected
bounding box.
imsize (tuple[int]): Size of the image canvas.
Returns:
tuple[float] or None: Intersection of the convex hull of the 2D box
corners and the image canvas, or None if they do not intersect.
"""
polygon_from_2d_box = MultiPoint(corner_coords).convex_hull
img_canvas = box(0, 0, imsize[0], imsize[1])
if polygon_from_2d_box.intersects(img_canvas):
img_intersection = polygon_from_2d_box.intersection(img_canvas)
intersection_coords = np.array(
[coord for coord in img_intersection.exterior.coords])
min_x = min(intersection_coords[:, 0])
min_y = min(intersection_coords[:, 1])
max_x = max(intersection_coords[:, 0])
max_y = max(intersection_coords[:, 1])
return min_x, min_y, max_x, max_y
else:
return None
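Illustrative sketch, not part of the converter itself: post_process_coords clips the convex hull of the projected corners against the image canvas and returns None when they do not overlap.

# Corners partly outside a 1600x900 canvas: the returned box is clipped at 0
corner_coords = [[-50.0, 100.0], [300.0, -40.0], [350.0, 500.0], [-20.0, 450.0]]
print(post_process_coords(corner_coords))  # approximately (0.0, 0.0, 350.0, 500.0)

# A hull lying entirely outside the canvas yields None, so get_2d_boxes skips
# the corresponding annotation
print(post_process_coords([[-100.0, -100.0], [-50.0, -80.0], [-60.0, -10.0]]))  # None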
def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float,
sample_data_token: str, filename: str) -> OrderedDict:
"""Generate one 2D annotation record given various information on top of
the 2D bounding box coordinates.
Args:
ann_rec (dict): Original 3d annotation record.
x1 (float): Minimum value of the x coordinate.
y1 (float): Minimum value of the y coordinate.
x2 (float): Maximum value of the x coordinate.
y2 (float): Maximum value of the y coordinate.
sample_data_token (str): Sample data token.
filename (str): The corresponding image file where the annotation
is present.
Returns:
dict: A sample mono3D annotation record.
- bbox_label (int): 2d box label id
- bbox_label_3d (int): 3d box label id
- bbox (list[float]): left x, top y, right x, bottom y
of 2d box
- bbox_3d_isvalid (bool): whether the box is valid
"""
repro_rec = OrderedDict()
repro_rec['sample_data_token'] = sample_data_token
coco_rec = dict()
relevant_keys = [
'attribute_tokens',
'category_name',
'instance_token',
'next',
'num_lidar_pts',
'num_radar_pts',
'prev',
'sample_annotation_token',
'sample_data_token',
'visibility_token',
]
for key, value in ann_rec.items():
if key in relevant_keys:
repro_rec[key] = value
repro_rec['bbox_corners'] = [x1, y1, x2, y2]
repro_rec['filename'] = filename
if repro_rec['category_name'] not in NameMapping:
return None
cat_name = NameMapping[repro_rec['category_name']]
coco_rec['bbox_label'] = nus_categories.index(cat_name)
coco_rec['bbox_label_3d'] = nus_categories.index(cat_name)
coco_rec['bbox'] = [x1, y1, x2, y2]
coco_rec['bbox_3d_isvalid'] = True
return coco_rec
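Illustrative sketch with hypothetical values: generate_record keeps only categories covered by NameMapping and maps them to indices of nus_categories.

ann = {
    'category_name': 'vehicle.car',
    'visibility_token': '4',
    'num_lidar_pts': 30,
    'num_radar_pts': 2,
}
rec = generate_record(ann, 100.0, 200.0, 400.0, 350.0,
                      'fake_sample_data_token', 'samples/CAM_FRONT/img.jpg')
# rec == {'bbox_label': 0, 'bbox_label_3d': 0,
#         'bbox': [100.0, 200.0, 400.0, 350.0], 'bbox_3d_isvalid': True}
# ('car' is index 0 in nus_categories)

# Unmapped categories (e.g. 'animal') return None and are dropped by the caller
assert generate_record({'category_name': 'animal'}, 0, 0, 1, 1, 't', 'f') is None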
......@@ -197,6 +197,7 @@ class Det3DDataset(BaseDataset):
ann_info = dict()
for ann_name in keys:
temp_anns = [item[ann_name] for item in instances]
# map the original dataset label to training label
if 'label' in ann_name:
temp_anns = [
self.label_mapping[item] for item in temp_anns
......
# Copyright (c) OpenMMLab. All rights reserved.
from os import path as osp
from typing import Dict, List
import numpy as np
from mmdet3d.core.bbox.structures.cam_box3d import CameraInstance3DBoxes
from mmdet3d.registry import DATASETS
from ..core.bbox import LiDARInstance3DBoxes
from .det3d_dataset import Det3DDataset
......@@ -53,6 +55,7 @@ class NuScenesDataset(Det3DDataset):
def __init__(self,
data_root: str,
ann_file: str,
task: str = '3d',
pipeline: List[dict] = None,
box_type_3d: str = 'LiDAR',
modality: Dict = dict(
......@@ -66,7 +69,12 @@ class NuScenesDataset(Det3DDataset):
**kwargs):
self.use_valid_flag = use_valid_flag
self.with_velocity = with_velocity
assert box_type_3d.lower() == 'lidar'
# TODO: Redesign multi-view data process in the future
assert task in ('3d', 'mono3d', 'multi-view')
self.task = task
assert box_type_3d.lower() in ('lidar', 'camera')
super().__init__(
data_root=data_root,
ann_file=ann_file,
......@@ -97,6 +105,7 @@ class NuScenesDataset(Det3DDataset):
anns_results['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
anns_results['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
return anns_results
if self.use_valid_flag:
mask = ann_info['bbox_3d_isvalid']
else:
......@@ -104,6 +113,22 @@ class NuScenesDataset(Det3DDataset):
gt_bboxes_3d = ann_info['gt_bboxes_3d'][mask]
gt_labels_3d = ann_info['gt_labels_3d'][mask]
if 'gt_bboxes' in ann_info:
gt_bboxes = ann_info['gt_bboxes'][mask]
gt_labels = ann_info['gt_labels'][mask]
attr_labels = ann_info['attr_labels'][mask]
else:
gt_bboxes = np.zeros((0, 4), dtype=np.float32)
gt_labels = np.array([], dtype=np.int64)
attr_labels = np.array([], dtype=np.int64)
if 'centers_2d' in ann_info:
centers_2d = ann_info['centers_2d'][mask]
depths = ann_info['depths'][mask]
else:
centers_2d = np.zeros((0, 2), dtype=np.float32)
depths = np.zeros((0), dtype=np.float32)
if self.with_velocity:
gt_velocity = ann_info['velocity'][mask]
nan_mask = np.isnan(gt_velocity[:, 0])
......@@ -112,11 +137,82 @@ class NuScenesDataset(Det3DDataset):
# the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
# the same as KITTI (0.5, 0.5, 0)
gt_bboxes_3d = LiDARInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
# TODO: Unify the coordinates
if self.task == 'mono3d':
gt_bboxes_3d = CameraInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
origin=(0.5, 0.5, 0.5))
else:
gt_bboxes_3d = LiDARInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
anns_results = dict(
gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d)
gt_bboxes_3d=gt_bboxes_3d,
gt_labels_3d=gt_labels_3d,
gt_bboxes=gt_bboxes,
gt_labels=gt_labels,
attr_labels=attr_labels,
centers_2d=centers_2d,
depths=depths)
return anns_results
def parse_data_info(self, info: dict) -> dict:
"""Process the raw data info.
The only difference from `Det3DDataset` is the dedicated processing for
the `mono3d` task, which splits each frame into per-camera data infos.
Args:
info (dict): Raw info dict.
Returns:
dict or list[dict]: Has `ann_info` in the training stage, and
all paths have been converted to absolute paths.
"""
if self.task == 'mono3d':
data_list = []
if self.modality['use_lidar']:
info['lidar_points']['lidar_path'] = \
osp.join(
self.data_prefix.get('pts', ''),
info['lidar_points']['lidar_path'])
if self.modality['use_camera']:
for cam_id, img_info in info['images'].items():
if 'img_path' in img_info:
if cam_id in self.data_prefix:
cam_prefix = self.data_prefix[cam_id]
else:
cam_prefix = self.data_prefix.get('img', '')
img_info['img_path'] = osp.join(
cam_prefix, img_info['img_path'])
for idx, (cam_id, img_info) in enumerate(info['images'].items()):
camera_info = dict()
camera_info['images'] = dict()
camera_info['images'][cam_id] = img_info
if 'cam_instances' in info and cam_id in info['cam_instances']:
camera_info['instances'] = info['cam_instances'][cam_id]
else:
camera_info['instances'] = []
# TODO: check whether to change sample_idx for 6 cameras
# in one frame
camera_info['sample_idx'] = info['sample_idx'] * 6 + idx
camera_info['token'] = info['token']
camera_info['ego2global'] = info['ego2global']
if not self.test_mode:
# used in training
camera_info['ann_info'] = self.parse_ann_info(camera_info)
if self.test_mode and self.load_eval_anns:
camera_info['eval_ann_info'] = \
self.parse_ann_info(camera_info)
data_list.append(camera_info)
return data_list
else:
data_info = super().parse_data_info(info)
return data_info
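Illustrative sketch, not part of the dataset code itself: in the mono3d branch above each frame is fanned out into one data info per camera, and sample_idx is remapped so that camera k of frame n becomes n * 6 + k. NuScenesMetric later uses sample_id % 6 to regroup the six views of a frame. The camera order below is only an example; the real order follows info['images'].

frame_sample_idx = 42
cameras = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT',
           'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT']
for idx, cam_id in enumerate(cameras):
    print(cam_id, frame_sample_idx * 6 + idx)  # 252, 253, ..., 257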
......@@ -122,7 +122,7 @@ class Pack3DDetInputs(BaseTransform):
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_labels_3d', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers2d', 'depths'
'pts_semantic_mask', 'centers_2d', 'depths'
]:
if key not in results:
continue
......
......@@ -86,7 +86,7 @@ class LoadImageFromFileMono3D(LoadImageFromFile):
:class:`LoadImageFromFile`.
"""
def __call__(self, results):
def transform(self, results: dict) -> dict:
"""Call functions to load image and get image meta information.
Args:
......@@ -95,8 +95,32 @@ class LoadImageFromFileMono3D(LoadImageFromFile):
Returns:
dict: The dict contains loaded image and meta information.
"""
super().__call__(results)
results['cam2img'] = results['img_info']['cam_intrinsic']
# TODO: load different camera image from data info,
# for kitti dataset, we load 'CAM2' image.
# for nuscenes dataset, we load 'CAM_FRONT' image.
if 'CAM2' in results['images']:
filename = results['images']['CAM2']['img_path']
results['cam2img'] = results['images']['CAM2']['cam2img']
elif len(list(results['images'].keys())) == 1:
camera_type = list(results['images'].keys())[0]
filename = results['images'][camera_type]['img_path']
results['cam2img'] = results['images'][camera_type]['cam2img']
else:
raise NotImplementedError(
'Currently we only support loading images from the KITTI '
'and nuScenes datasets')
img_bytes = self.file_client.get(filename)
img = mmcv.imfrombytes(
img_bytes, flag=self.color_type, backend=self.imdecode_backend)
if self.to_float32:
img = img.astype(np.float32)
results['img'] = img
results['img_shape'] = img.shape[:2]
results['ori_shape'] = img.shape[:2]
return results
......@@ -608,6 +632,34 @@ class LoadAnnotations3D(LoadAnnotations):
self.with_seg_3d = with_seg_3d
self.seg_3d_dtype = seg_3d_dtype
def _load_bboxes(self, results: dict) -> None:
"""Private function to load bounding box annotations.
Rewrite `_load_bboxes` since mmdet3d uses `parse_ann_info` in
datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
results['gt_bboxes'] = results['ann_info']['gt_bboxes']
def _load_labels(self, results: dict) -> None:
"""Private function to load label annotations.
Rewrite `_load_labels` since mmdet3d uses `parse_ann_info` in
datasets.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict contains loaded label annotations.
"""
results['gt_labels'] = results['ann_info']['gt_labels']
def _load_bboxes_3d(self, results: dict) -> dict:
"""Private function to move the 3D bounding box annotation from
`ann_info` field to the root of `results`.
......
......@@ -1579,7 +1579,7 @@ class VoxelBasedPointSampler(object):
@TRANSFORMS.register_module()
class AffineResize(object):
class AffineResize(BaseTransform):
"""Get the affine transform matrices to the target size.
Different from :class:`RandomAffine` in MMDetection, this class can
......@@ -1596,13 +1596,16 @@ class AffineResize(object):
outside the border of the image. Defaults to True.
"""
def __init__(self, img_scale, down_ratio, bbox_clip_border=True):
def __init__(self,
img_scale: Tuple,
down_ratio: int,
bbox_clip_border: bool = True) -> None:
self.img_scale = img_scale
self.down_ratio = down_ratio
self.bbox_clip_border = bbox_clip_border
def __call__(self, results):
def transform(self, results: dict) -> dict:
"""Call function to do affine transform to input image and labels.
Args:
......@@ -1647,39 +1650,38 @@ class AffineResize(object):
results['pad_shape'] = img.shape
results['trans_mat'] = trans_mat
self._affine_bboxes(results, trans_affine)
if 'gt_bboxes' in results:
self._affine_bboxes(results, trans_affine)
if 'centers2d' in results:
centers2d = self._affine_transform(results['centers2d'],
if 'centers_2d' in results:
centers2d = self._affine_transform(results['centers_2d'],
trans_affine)
valid_index = (centers2d[:, 0] >
0) & (centers2d[:, 0] <
self.img_scale[0]) & (centers2d[:, 1] > 0) & (
centers2d[:, 1] < self.img_scale[1])
results['centers2d'] = centers2d[valid_index]
for key in results.get('bbox_fields', []):
if key in ['gt_bboxes']:
results[key] = results[key][valid_index]
if 'gt_labels' in results:
results['gt_labels'] = results['gt_labels'][
valid_index]
if 'gt_masks' in results:
raise NotImplementedError(
'AffineResize only supports bbox.')
for key in results.get('bbox3d_fields', []):
if key in ['gt_bboxes_3d']:
results[key].tensor = results[key].tensor[valid_index]
if 'gt_labels_3d' in results:
results['gt_labels_3d'] = results['gt_labels_3d'][
valid_index]
results['centers_2d'] = centers2d[valid_index]
if 'gt_bboxes' in results:
results['gt_bboxes'] = results['gt_bboxes'][valid_index]
if 'gt_labels' in results:
results['gt_labels'] = results['gt_labels'][valid_index]
if 'gt_masks' in results:
raise NotImplementedError(
'AffineResize only supports bbox.')
if 'gt_bboxes_3d' in results:
results['gt_bboxes_3d'].tensor = results[
'gt_bboxes_3d'].tensor[valid_index]
if 'gt_labels_3d' in results:
results['gt_labels_3d'] = results['gt_labels_3d'][
valid_index]
results['depths'] = results['depths'][valid_index]
return results
def _affine_bboxes(self, results, matrix):
def _affine_bboxes(self, results: dict, matrix: np.ndarray) -> None:
"""Affine transform bboxes to input image.
Args:
......@@ -1689,20 +1691,18 @@ class AffineResize(object):
shape: (3, 3)
"""
for key in results.get('bbox_fields', []):
bboxes = results[key]
bboxes[:, :2] = self._affine_transform(bboxes[:, :2], matrix)
bboxes[:, 2:] = self._affine_transform(bboxes[:, 2:], matrix)
if self.bbox_clip_border:
bboxes[:,
[0, 2]] = bboxes[:,
[0, 2]].clip(0, self.img_scale[0] - 1)
bboxes[:,
[1, 3]] = bboxes[:,
[1, 3]].clip(0, self.img_scale[1] - 1)
results[key] = bboxes
def _affine_transform(self, points, matrix):
bboxes = results['gt_bboxes']
bboxes[:, :2] = self._affine_transform(bboxes[:, :2], matrix)
bboxes[:, 2:] = self._affine_transform(bboxes[:, 2:], matrix)
if self.bbox_clip_border:
bboxes[:, [0, 2]] = bboxes[:, [0, 2]].clip(0,
self.img_scale[0] - 1)
bboxes[:, [1, 3]] = bboxes[:, [1, 3]].clip(0,
self.img_scale[1] - 1)
results['gt_bboxes'] = bboxes
def _affine_transform(self, points: np.ndarray,
matrix: np.ndarray) -> np.ndarray:
"""Affine transform bbox points to input image.
Args:
......@@ -1721,7 +1721,8 @@ class AffineResize(object):
affined_points = np.matmul(matrix, hom_points_2d).T
return affined_points[:, :2]
def _get_transform_matrix(self, center, scale, output_scale):
def _get_transform_matrix(self, center: Tuple, scale: Tuple,
output_scale: Tuple[float]) -> np.ndarray:
"""Get affine transform matrix.
Args:
......@@ -1756,7 +1757,8 @@ class AffineResize(object):
return matrix.astype(np.float32)
def _get_ref_point(self, ref_point1, ref_point2):
def _get_ref_point(self, ref_point1: np.ndarray,
ref_point2: np.ndarray) -> np.ndarray:
"""Get reference point to calculate affine transform matrix.
While using opencv to calculate the affine matrix, we need at least
......@@ -1775,7 +1777,7 @@ class AffineResize(object):
@TRANSFORMS.register_module()
class RandomShiftScale(object):
class RandomShiftScale(BaseTransform):
"""Random shift scale.
Different from the normal shift and scale function, it doesn't
......@@ -1788,12 +1790,12 @@ class RandomShiftScale(object):
aug_prob (float): The shifting and scaling probability.
"""
def __init__(self, shift_scale, aug_prob):
def __init__(self, shift_scale: Tuple[float], aug_prob: float):
self.shift_scale = shift_scale
self.aug_prob = aug_prob
def __call__(self, results):
def transform(self, results: dict) -> dict:
"""Call function to record random shift and scale infos.
Args:
......
......@@ -45,6 +45,7 @@ class KittiMetric(BaseMetric):
def __init__(self,
ann_file: str,
metric: Union[str, List[str]] = 'bbox',
pred_box_type_3d: str = 'LiDAR',
pcd_limit_range: List[float] = [0, -40, -3, 70.4, 40, 0.0],
prefix: Optional[str] = None,
pklfile_prefix: str = None,
......@@ -57,6 +58,7 @@ class KittiMetric(BaseMetric):
self.ann_file = ann_file
self.pklfile_prefix = pklfile_prefix
self.submission_prefix = submission_prefix
self.pred_box_type_3d = pred_box_type_3d
allowed_metrics = ['bbox', 'img_bbox', 'mAP']
self.metrics = metric if isinstance(metric, list) else [metric]
......
......@@ -7,12 +7,15 @@ from typing import Dict, List, Optional, Sequence, Tuple, Union
import mmcv
import numpy as np
import pyquaternion
import torch
from mmengine.evaluator import BaseMetric
from mmengine.logging import MMLogger
from nuscenes.eval.detection.config import config_factory
from nuscenes.eval.detection.data_classes import DetectionConfig
from nuscenes.utils.data_classes import Box as NuScenesBox
from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr
from mmdet3d.core.bbox import CameraInstance3DBoxes, LiDARInstance3DBoxes
from mmdet3d.registry import METRICS
......@@ -288,21 +291,144 @@ class NuScenesMetric(BaseMetric):
for name in results[0]:
if 'pred' in name and '3d' in name and name[0] != '_':
# format result of model output in Det3dDataSample,
# include 'pred_instances_3d','pts_pred_instances_3d',
# 'img_pred_instances_3d'
print(f'\nFormatting bboxes of {name}')
results_ = [out[name] for out in results]
tmp_file_ = osp.join(jsonfile_prefix, name)
result_dict[name] = self._format_bbox(results_, sample_id_list,
classes, tmp_file_)
box_type_3d = type(results_[0]['bboxes_3d'])
if box_type_3d == LiDARInstance3DBoxes:
result_dict[name] = self._format_lidar_bbox(
results_, sample_id_list, classes, tmp_file_)
elif box_type_3d == CameraInstance3DBoxes:
result_dict[name] = self._format_camera_bbox(
results_, sample_id_list, classes, tmp_file_)
return result_dict, tmp_dir
def _format_bbox(self,
results: List[dict],
sample_id_list: List[int],
classes: List[str] = None,
jsonfile_prefix: str = None) -> str:
def _format_camera_bbox(self,
results: List[dict],
sample_id_list: List[int],
classes: List[str] = None,
jsonfile_prefix: str = None) -> str:
"""Convert the results to the standard format.
Args:
results (list[dict]): Testing results of the dataset.
jsonfile_prefix (str): The prefix of the output jsonfile.
You can specify the output directory/filename by
modifying the jsonfile_prefix. Default: None.
Returns:
str: Path of the output json file.
"""
nusc_annos = {}
print('Start to convert detection format...')
# Camera types in the nuScenes dataset
camera_types = [
'CAM_FRONT',
'CAM_FRONT_RIGHT',
'CAM_FRONT_LEFT',
'CAM_BACK',
'CAM_BACK_LEFT',
'CAM_BACK_RIGHT',
]
CAM_NUM = 6
for i, det in enumerate(mmcv.track_iter_progress(results)):
sample_id = sample_id_list[i]
camera_type_id = sample_id % CAM_NUM
if camera_type_id == 0:
boxes_per_frame = []
attrs_per_frame = []
# need to merge results from images of the same sample
annos = []
boxes, attrs = output_to_nusc_box(det)
sample_token = self.data_infos[sample_id]['token']
camera_type = camera_types[camera_type_id]
boxes, attrs = cam_nusc_box_to_global(
self.data_infos[sample_id - camera_type_id], boxes, attrs,
camera_type, classes, self.eval_detection_configs)
boxes_per_frame.extend(boxes)
attrs_per_frame.extend(attrs)
# Remove redundant predictions caused by overlap of images
if (sample_id + 1) % CAM_NUM != 0:
continue
boxes = global_nusc_box_to_cam(
self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame,
classes, self.eval_detection_configs)
cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes)
# box nms 3d over 6 images in a frame
# TODO: move this global setting into config
nms_cfg = dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=4096,
nms_thr=0.05,
score_thr=0.01,
min_bbox_size=0,
max_per_frame=500)
from mmcv import Config
nms_cfg = Config(nms_cfg)
cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev)
boxes3d = cam_boxes3d.tensor
# generate attr scores from attr labels
attrs = labels.new_tensor([attr for attr in attrs_per_frame])
boxes3d, scores, labels, attrs = box3d_multiclass_nms(
boxes3d,
cam_boxes3d_for_nms,
scores,
nms_cfg.score_thr,
nms_cfg.max_per_frame,
nms_cfg,
mlvl_attr_scores=attrs)
cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9)
det = bbox3d2result(cam_boxes3d, scores, labels, attrs)
boxes, attrs = output_to_nusc_box(det)
boxes, attrs = cam_nusc_box_to_global(
self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs,
classes, self.eval_detection_configs)
for i, box in enumerate(boxes):
name = classes[box.label]
attr = self.get_attr_name(attrs[i], name)
nusc_anno = dict(
sample_token=sample_token,
translation=box.center.tolist(),
size=box.wlh.tolist(),
rotation=box.orientation.elements.tolist(),
velocity=box.velocity[:2].tolist(),
detection_name=name,
detection_score=box.score,
attribute_name=attr)
annos.append(nusc_anno)
# other views results of the same frame should be concatenated
if sample_token in nusc_annos:
nusc_annos[sample_token].extend(annos)
else:
nusc_annos[sample_token] = annos
nusc_submissions = {
'meta': self.modality,
'results': nusc_annos,
}
mmcv.mkdir_or_exist(jsonfile_prefix)
res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
print('Results written to', res_path)
mmcv.dump(nusc_submissions, res_path)
return res_path
def _format_lidar_bbox(self,
results: List[dict],
sample_id_list: List[int],
classes: List[str] = None,
jsonfile_prefix: str = None) -> str:
"""Convert the results to the standard format.
Args:
......@@ -389,27 +515,59 @@ def output_to_nusc_box(detection: dict) -> List[NuScenesBox]:
bbox3d = detection['bboxes_3d']
scores = detection['scores_3d'].numpy()
labels = detection['labels_3d'].numpy()
attrs = None
if 'attr_labels' in detection:
attrs = detection['attr_labels'].numpy()
box_gravity_center = bbox3d.gravity_center.numpy()
box_dims = bbox3d.dims.numpy()
box_yaw = bbox3d.yaw.numpy()
# our LiDAR coordinate system -> nuScenes box coordinate system
nus_box_dims = box_dims[:, [1, 0, 2]]
box_list = []
for i in range(len(bbox3d)):
quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
velocity = (*bbox3d.tensor[i, 7:9], 0.0)
box = NuScenesBox(
box_gravity_center[i],
nus_box_dims[i],
quat,
label=labels[i],
score=scores[i],
velocity=velocity)
box_list.append(box)
return box_list
if type(bbox3d) == LiDARInstance3DBoxes:
# our LiDAR coordinate system -> nuScenes box coordinate system
nus_box_dims = box_dims[:, [1, 0, 2]]
for i in range(len(bbox3d)):
quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
velocity = (*bbox3d.tensor[i, 7:9], 0.0)
# velo_val = np.linalg.norm(box3d[i, 7:9])
# velo_ori = box3d[i, 6]
# velocity = (
# velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0)
box = NuScenesBox(
box_gravity_center[i],
nus_box_dims[i],
quat,
label=labels[i],
score=scores[i],
velocity=velocity)
box_list.append(box)
elif type(bbox3d) == CameraInstance3DBoxes:
# our Camera coordinate system -> nuScenes box coordinate system
# convert the dim/rot to nuscbox convention
nus_box_dims = box_dims[:, [2, 0, 1]]
nus_box_yaw = -box_yaw
for i in range(len(bbox3d)):
q1 = pyquaternion.Quaternion(
axis=[0, 0, 1], radians=nus_box_yaw[i])
q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2)
quat = q2 * q1
velocity = (bbox3d.tensor[i, 7], 0.0, bbox3d.tensor[i, 8])
box = NuScenesBox(
box_gravity_center[i],
nus_box_dims[i],
quat,
label=labels[i],
score=scores[i],
velocity=velocity)
box_list.append(box)
else:
raise NotImplementedError(
f'Converting {type(bbox3d)} bboxes to standard '
'NuScenesBoxes is not supported.')
return box_list, attrs
def lidar_nusc_box_to_global(
......@@ -448,3 +606,117 @@ def lidar_nusc_box_to_global(
box.translate(ego2global[:3, 3])
box_list.append(box)
return box_list
def cam_nusc_box_to_global(info: dict, boxes: List[NuScenesBox],
attrs: List[str], camera_type: str,
classes: List[str],
eval_configs: DetectionConfig) -> Tuple[List[NuScenesBox], List[str]]:
"""Convert the box from camera to global coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
attrs (list[str]): List of attributes.
camera_type (str): Type of camera.
classes (list[str]): Mapped classes in the evaluation.
eval_configs (object): Evaluation configuration object.
Returns:
tuple[list, list]: List of standard NuScenesBoxes in the global
coordinate and the corresponding list of attributes.
"""
box_list = []
attr_list = []
for (box, attr) in zip(boxes, attrs):
# Move box to ego vehicle coord system
cam2ego = np.array(info['images'][camera_type]['cam2ego'])
box.rotate(
pyquaternion.Quaternion(matrix=cam2ego, rtol=1e-05, atol=1e-07))
box.translate(cam2ego[:3, 3])
# filter det in ego.
cls_range_map = eval_configs.class_range
radius = np.linalg.norm(box.center[:2], 2)
det_range = cls_range_map[classes[box.label]]
if radius > det_range:
continue
# Move box to global coord system
ego2global = np.array(info['ego2global'])
box.rotate(
pyquaternion.Quaternion(matrix=ego2global, rtol=1e-05, atol=1e-07))
box.translate(ego2global[:3, 3])
box_list.append(box)
attr_list.append(attr)
return box_list, attr_list
def global_nusc_box_to_cam(info: dict, boxes: List[NuScenesBox],
classes: List[str],
eval_configs: DetectionConfig) -> List[NuScenesBox]:
"""Convert the box from global to camera coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
classes (list[str]): Mapped classes in the evaluation.
eval_configs (object): Evaluation configuration object.
Returns:
list: List of standard NuScenesBoxes in the camera
coordinate.
"""
box_list = []
for box in boxes:
# Move box to ego vehicle coord system
ego2global = np.array(info['ego2global'])
box.translate(-ego2global[:3, 3])
box.rotate(
pyquaternion.Quaternion(matrix=ego2global, rtol=1e-05,
atol=1e-07).inverse)
# filter det in ego.
cls_range_map = eval_configs.class_range
radius = np.linalg.norm(box.center[:2], 2)
det_range = cls_range_map[classes[box.label]]
if radius > det_range:
continue
# Move box to camera coord system
cam2ego = np.array(info['images']['CAM_FRONT']['cam2ego'])
box.translate(-cam2ego[:3, 3])
box.rotate(
pyquaternion.Quaternion(matrix=cam2ego, rtol=1e-05,
atol=1e-07).inverse)
box_list.append(box)
return box_list
def nusc_box_to_cam_box3d(boxes: List[NuScenesBox]):
"""Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`.
Args:
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
Returns:
tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor):
Converted 3D bounding boxes, scores and labels.
"""
locs = torch.Tensor([b.center for b in boxes]).view(-1, 3)
dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3)
rots = torch.Tensor([b.orientation.yaw_pitch_roll[0]
for b in boxes]).view(-1, 1)
velocity = torch.Tensor([b.velocity[0::2] for b in boxes]).view(-1, 2)
# convert nusbox to cambox convention
dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]]
rots = -rots
boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda()
cam_boxes3d = CameraInstance3DBoxes(
boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5))
scores = torch.Tensor([b.score for b in boxes]).cuda()
labels = torch.LongTensor([b.label for b in boxes]).cuda()
nms_scores = scores.new_zeros(scores.shape[0], 10 + 1)
indices = labels.new_tensor(list(range(scores.shape[0])))
nms_scores[indices, labels] = scores
return cam_boxes3d, nms_scores, labels
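Illustrative sketch, not part of the metric code itself: the last lines of nusc_box_to_cam_box3d scatter each box score into the column of its predicted class, producing the per-class score layout consumed by box3d_multiclass_nms. A CPU-only version of that scatter:

import torch

scores = torch.tensor([0.9, 0.4, 0.7])
labels = torch.tensor([0, 3, 5])
nms_scores = scores.new_zeros(scores.shape[0], 10 + 1)  # 10 classes + 1 background slot
nms_scores[torch.arange(scores.shape[0]), labels] = scores
# Row i holds the score of box i only in the column of its predicted class:
# nms_scores[0, 0] == 0.9, nms_scores[1, 3] == 0.4, nms_scores[2, 5] == 0.7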
......@@ -106,8 +106,7 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
if 'points' in inputs_dict[0].keys():
points = [input['points'] for input in inputs_dict]
else:
raise KeyError(
"Model input dict needs to include the 'points' key.")
points = None
if 'img' in inputs_dict[0].keys():
......
# Copyright (c) OpenMMLab. All rights reserved.
from abc import abstractmethod
from typing import Any, List, Sequence, Tuple, Union
import torch
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from mmcv.runner import force_fp32
from torch import Tensor
from torch import nn as nn
from mmdet3d.core.utils import ConfigType, InstanceList, OptConfigType
from mmdet3d.registry import MODELS
from mmdet.core import multi_apply
from ..builder import build_loss
from .base_mono3d_dense_head import BaseMono3DDenseHead
......@@ -20,39 +21,41 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
feat_channels (int, optional): Number of hidden channels.
feat_channels (int): Number of hidden channels.
Used in child classes. Defaults to 256.
stacked_convs (int, optional): Number of stacking convs of the head.
strides (tuple, optional): Downsample factor of each feature map.
dcn_on_last_conv (bool, optional): If true, use dcn in the last
stacked_convs (int): Number of stacking convs of the head.
strides (Sequence[int] or Sequence[Tuple[int, int]]): Downsample
factor of each feature map.
dcn_on_last_conv (bool): If true, use dcn in the last
layer of towers. Default: False.
conv_bias (bool | str, optional): If specified as `auto`, it will be
conv_bias (bool or str): If specified as `auto`, it will be
decided by the norm_cfg. Bias of conv will be set as True
if `norm_cfg` is None, otherwise False. Default: 'auto'.
background_label (int, optional): Label ID of background,
background_label (int, Optional): Label ID of background,
set as 0 for RPN and num_classes for other heads.
It will automatically be set to `num_classes` if None is given.
use_direction_classifier (bool, optional):
use_direction_classifier (bool):
Whether to add a direction classifier.
diff_rad_by_sin (bool, optional): Whether to change the difference
diff_rad_by_sin (bool): Whether to change the difference
into sin difference for box regression loss. Defaults to True.
dir_offset (float, optional): Parameter used in direction
dir_offset (float): Parameter used in direction
classification. Defaults to 0.
dir_limit_offset (float, optional): Parameter used in direction
dir_limit_offset (float): Parameter used in direction
classification. Defaults to 0.
loss_cls (dict, optional): Config of classification loss.
loss_bbox (dict, optional): Config of localization loss.
loss_dir (dict, optional): Config of direction classifier loss.
loss_attr (dict, optional): Config of attribute classifier loss,
which is only active when `pred_attrs=True`.
bbox_code_size (int, optional): Dimensions of predicted bounding boxes.
pred_attrs (bool, optional): Whether to predict attributes.
loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
loss_dir (:obj:`ConfigDict` or dict): Config of direction classifier
loss.
loss_attr (:obj:`ConfigDict` or dict): Config of attribute classifier
loss, which is only active when `pred_attrs=True`.
bbox_code_size (int): Dimensions of predicted bounding boxes.
pred_attrs (bool): Whether to predict attributes.
Defaults to False.
num_attrs (int, optional): The number of attributes to be predicted.
num_attrs (int): The number of attributes to be predicted.
Default: 9.
pred_velo (bool, optional): Whether to predict velocity.
pred_velo (bool): Whether to predict velocity.
Defaults to False.
pred_bbox2d (bool, optional): Whether to predict 2D boxes.
pred_bbox2d (bool): Whether to predict 2D boxes.
Defaults to False.
group_reg_dims (tuple[int], optional): The dimension of each regression
target group. Default: (2, 1, 3, 1, 2).
......@@ -66,68 +69,77 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
(64, ), # rot
() # velo
),
dir_branch (tuple[int], optional): Channels for direction
dir_branch (Sequence[int]): Channels for direction
classification branch. Default: (64, ).
attr_branch (tuple[int], optional): Channels for classification branch.
attr_branch (Sequence[int]): Channels for attribute classification branch.
Default: (64, ).
conv_cfg (dict, optional): Config dict for convolution layer.
Default: None.
norm_cfg (dict, optional): Config dict for normalization layer.
Default: None.
train_cfg (dict, optional): Training config of anchor head.
test_cfg (dict, optional): Testing config of anchor head.
conv_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for
convolution layer. Default: None.
norm_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for
normalization layer. Default: None.
train_cfg (:obj:`ConfigDict` or dict, Optional): Training config
of anchor head.
test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of
anchor head.
init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
dict]): Initialization config dict.
""" # noqa: W605
_version = 1
def __init__(
self,
num_classes,
in_channels,
feat_channels=256,
stacked_convs=4,
strides=(4, 8, 16, 32, 64),
dcn_on_last_conv=False,
conv_bias='auto',
background_label=None,
use_direction_classifier=True,
diff_rad_by_sin=True,
dir_offset=0,
dir_limit_offset=0,
loss_cls=dict(
type='FocalLoss',
num_classes: int,
in_channels: int,
feat_channels: int = 256,
stacked_convs: int = 4,
strides: Sequence[int] = (4, 8, 16, 32, 64),
dcn_on_last_conv: bool = False,
conv_bias: Union[bool, str] = 'auto',
background_label: Optional[int] = None,
use_direction_classifier: bool = True,
diff_rad_by_sin: bool = True,
dir_offset: float = 0,
dir_limit_offset: float = 0,
loss_cls: ConfigType = dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
bbox_code_size=9, # For nuscenes
pred_attrs=False,
num_attrs=9, # For nuscenes
pred_velo=False,
pred_bbox2d=False,
group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo,
cls_branch=(128, 64),
reg_branch=(
loss_bbox: ConfigType = dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir: ConfigType = dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_attr: ConfigType = dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
bbox_code_size: int = 9, # For nuscenes
pred_attrs: bool = False,
num_attrs: int = 9, # For nuscenes
pred_velo: bool = False,
pred_bbox2d: bool = False,
group_reg_dims: Sequence[int] = (
2, 1, 3, 1, 2), # offset, depth, size, rot, velo,
cls_branch: Sequence[int] = (128, 64),
reg_branch: Sequence[Tuple[int, int]] = (
(128, 64), # offset
(128, 64), # depth
(64, ), # size
(64, ), # rot
() # velo
),
dir_branch=(64, ),
attr_branch=(64, ),
conv_cfg=None,
norm_cfg=None,
train_cfg=None,
test_cfg=None,
init_cfg=None):
super(AnchorFreeMono3DHead, self).__init__(init_cfg=init_cfg)
dir_branch: Sequence[int] = (64, ),
attr_branch: Sequence[int] = (64, ),
conv_cfg: OptConfigType = None,
norm_cfg: OptConfigType = None,
train_cfg: OptConfigType = None,
test_cfg: OptConfigType = None,
init_cfg: OptConfigType = None) -> None:
super().__init__(init_cfg=init_cfg)
self.num_classes = num_classes
self.cls_out_channels = num_classes
self.in_channels = in_channels
......@@ -141,9 +153,9 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
self.diff_rad_by_sin = diff_rad_by_sin
self.dir_offset = dir_offset
self.dir_limit_offset = dir_limit_offset
self.loss_cls = build_loss(loss_cls)
self.loss_bbox = build_loss(loss_bbox)
self.loss_dir = build_loss(loss_dir)
self.loss_cls = MODELS.build(loss_cls)
self.loss_bbox = MODELS.build(loss_bbox)
self.loss_dir = MODELS.build(loss_dir)
self.bbox_code_size = bbox_code_size
self.group_reg_dims = list(group_reg_dims)
self.cls_branch = cls_branch
......@@ -174,7 +186,7 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
self.num_attrs = num_attrs
if self.pred_attrs:
self.attr_background_label = num_attrs
self.loss_attr = build_loss(loss_attr)
self.loss_attr = MODELS.build(loss_attr)
self.attr_branch = attr_branch
self._init_layers()
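# A hedged config sketch (illustrative, not taken from this commit) of how a
# concrete child head could be configured after the refactor: losses are built
# through the MODELS registry, so detection losses now use the 'mmdet.' scope.
bbox_head = dict(
    type='FCOSMono3DHead',  # assumed child head name, for illustration only
    num_classes=10,
    in_channels=256,
    feat_channels=256,
    stacked_convs=2,
    strides=(8, 16, 32, 64, 128),
    loss_cls=dict(
        type='mmdet.FocalLoss',
        use_sigmoid=True,
        gamma=2.0,
        alpha=0.25,
        loss_weight=1.0),
    loss_bbox=dict(type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
    loss_dir=dict(
        type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))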
......@@ -316,11 +328,13 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
if self.pred_attrs:
normal_init(self.conv_attr, std=0.01, bias=bias_cls)
def forward(self, feats):
def forward(
self, x: Tuple[Tensor]
) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]:
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
x (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
......@@ -339,9 +353,9 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
level, each is a 4D-tensor, the channel number is
num_points * num_attrs.
"""
return multi_apply(self.forward_single, feats)[:5]
return multi_apply(self.forward_single, x)[:5]
def forward_single(self, x):
def forward_single(self, x: Tensor) -> Tuple[Tensor, ...]:
"""Forward features of a single scale level.
Args:
......@@ -394,77 +408,8 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
reg_feat
@abstractmethod
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def loss(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
batch_gt_instances_3d,
batch_img_metas,
batch_gt_instances_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_attrs.
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instance_3d. It usually includes ``bboxes``, ``labels``,
``bboxes_3d``, ``labels_3d``, ``depths``, ``centers_2d`` and
attributes.
batch_img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
Batch of gt_instances_ignore. It includes ``bboxes`` attribute
data that is ignored during training and testing.
Defaults to None.
"""
raise NotImplementedError
@abstractmethod
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def get_results(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
batch_img_metas,
cfg=None,
rescale=None):
"""Transform network output for a batch into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level.
Has shape (N, num_points * num_classes, H, W).
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_points * bbox_code_size, H, W)
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Attribute scores for each scale level.
Has shape (N, num_points * num_attrs, H, W).
batch_img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config): Test / postprocessing configuration.
If None, test_cfg would be used.
rescale (bool): If True, return boxes in the original image space.
"""
raise NotImplementedError
@abstractmethod
def get_targets(self, points, batch_gt_instances_3d):
def get_targets(self, points: List[Tensor],
batch_gt_instances_3d: InstanceList) -> Any:
"""Compute regression, classification and centerss targets for points
in multiple images.
......@@ -473,18 +418,32 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
(num_points, 2).
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instance_3d. It usually includes ``bboxes``, ``labels``,
``bboxes_3d``, ``labels3d``, ``depths``, ``centers2d`` and
attributes.
``bboxes_3d``, ``labels_3d``, ``depths``, ``centers_2d``
and attributes.
"""
raise NotImplementedError
# TODO: Refactor using MlvlPointGenerator in MMDet.
def _get_points_single(self,
featmap_size,
stride,
dtype,
device,
flatten=False):
"""Get points of a single scale level."""
featmap_size: Tuple[int],
stride: int,
dtype: torch.dtype,
device: torch.device,
flatten: bool = False) -> Tuple[Tensor, Tensor]:
"""Get points of a single scale level.
Args:
featmap_size (tuple[int]): Single scale level feature map
size.
stride (int): Downsample factor of the feature map.
dtype (torch.dtype): Type of points.
device (torch.device): Device of points.
flatten (bool): Whether to flatten the tensor.
Defaults to False.
Returns:
tuple[Tensor, Tensor]: y and x coordinates of the points on this scale level.
"""
h, w = featmap_size
x_range = torch.arange(w, dtype=dtype, device=device)
y_range = torch.arange(h, dtype=dtype, device=device)
......@@ -494,16 +453,23 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
x = x.flatten()
return y, x
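# A minimal standalone sketch (illustrative, not from this diff) of the grid
# that _get_points_single builds for one level; child heads typically scale the
# flattened (y, x) indices by `stride` and shift them to cell centers.
import torch

h, w, stride = 2, 3, 8
# indexing='ij' needs a recent PyTorch (>= 1.10)
y, x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij')
points = torch.stack(
    (x.flatten() * stride, y.flatten() * stride), dim=-1) + stride // 2
print(points)
# -> six (x, y) cell centers: (4, 4), (12, 4), (20, 4), (4, 12), (12, 12), (20, 12)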
def get_points(self, featmap_sizes, dtype, device, flatten=False):
# TODO: Refactor using MlvlPointGenerator in MMDet.
def get_points(self,
featmap_sizes: List[Tuple[int]],
dtype: torch.dtype,
device: torch.device,
flatten: bool = False) -> List[Tuple[Tensor, Tensor]]:
"""Get points according to feature map sizes.
Args:
featmap_sizes (list[tuple]): Multi-level feature map sizes.
dtype (torch.dtype): Type of points.
device (torch.device): Device of points.
flatten (bool): Whether to flatten the tensor.
Defaults to False.
Returns:
tuple: points of each image.
list[tuple]: points of each image.
"""
mlvl_points = []
for i in range(len(featmap_sizes)):
......