Unverified Commit 6d3518d0 authored by lianqing01 and committed by GitHub

[Features] Support waymo challenge solution (#1716)



* update evaluation metric to support waymo cam only evaluation

* add transformation for bev detection

* add multiview dfm

* support multiview detection in datasets with transformation, dfm model and metric

* remove deprecated config and update doc string

* remove file_client_args=file_client_args and update docstring

* add doc string and remove pdb

* fix the doc string of voxel fusion

* add doc string

* remove lidar2img

* add doc string

* update doc string

* support waymo dataset for replace_ceph and modify path of pkl in config

* update evaluation metrics and the config for the waymo solution

* fix the index error in waymo metric and add lidar2img utils function

* replace __call__ with transform

* fix doc string

* rename configs

* update the config name

* update the lidar2cam calib in waymo data creator
Co-authored-by: lianqing <lianqing1997@gmail.com>
Co-authored-by: Tai-Wang <tab_wang@outlook.com>
parent 25e38012
...@@ -133,3 +133,4 @@ data/sunrgbd/OFFICIAL_SUNRGBD/
# Waymo evaluation
mmdet3d/core/evaluation/waymo_utils/compute_detection_metrics_main
mmdet3d/core/evaluation/waymo_utils/compute_detection_let_metrics_main
# dataset settings
# D3 in the config name means the whole dataset is divided into 3 folds
# We only use one fold for efficient experiments
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
class_names = ['Car', 'Pedestrian', 'Cyclist']
input_modality = dict(use_lidar=False, use_camera=True)
file_client_args = dict(backend='disk')
# Uncomment the following if using ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=False,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
# base shape (1248, 832), scale (0.95, 1.05)
dict(
type='RandomResize3D',
        scale=(1248, 832),
ratio_range=(0.95, 1.05),
keep_ratio=True,
),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(1., 1.),
keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img']),
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(1., 1.),
keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img']),
]
metainfo = dict(CLASSES=class_names)
train_dataloader = dict(
batch_size=3,
num_workers=3,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_train.pkl',
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
metainfo=metainfo,
        # we use box_type_3d='LiDAR' in kitti and nuscenes datasets
        # and box_type_3d='Depth' in sunrgbd and scannet datasets.
box_type_3d='Camera',
task='mono3d',
# load one frame every three frames
load_interval=5))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
ann_file='waymo_infos_val.pkl',
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
        # we use box_type_3d='LiDAR' in kitti and nuscenes datasets
        # and box_type_3d='Depth' in sunrgbd and scannet datasets.
box_type_3d='Camera',
task='mono3d',
))
test_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
ann_file='waymo_infos_val.pkl',
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
        # we use box_type_3d='LiDAR' in kitti and nuscenes datasets
        # and box_type_3d='Depth' in sunrgbd and scannet datasets.
box_type_3d='Camera',
task='mono3d',
))
val_evaluator = dict(
type='WaymoMetric',
ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
data_root='./data/waymo/waymo_format',
metric='LET_mAP',
task='mono3d')
test_evaluator = val_evaluator
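
For reference, a dataset/evaluator config like the one above is normally consumed through mmengine rather than imported directly. A minimal sketch, assuming the snippet above is saved as a standalone base config (the file path below is hypothetical):

# Minimal sketch: load the mono3d Waymo dataset config and inspect a few fields.
# The config path is a placeholder; point it at wherever the snippet above lives
# in your checkout.
from mmengine.config import Config

cfg = Config.fromfile('configs/_base_/datasets/waymo_mono3d_example.py')  # hypothetical path
print(cfg.train_dataloader.dataset.ann_file)  # waymo_infos_train.pkl
print(cfg.val_evaluator.metric)               # LET_mAP
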
# dataset settings
# D5 in the config name means the whole dataset is divided into 5 folds
# We only use one fold for efficient experiments
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
file_client_args = dict(backend='disk')
# Uncomment the following if using ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
class_names = ['Car', 'Pedestrian', 'Cyclist']
input_modality = dict(use_lidar=False, use_camera=True)
point_cloud_range = [-35.0, -75.0, -2, 75.0, 75.0, 4]
train_transforms = [
dict(type='PhotoMetricDistortion3D'),
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(0.95, 1.05),
keep_ratio=True),
dict(type='RandomCrop3D', crop_size=(720, 1080)),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5, flip_box3d=False),
]
train_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=False,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='MultiViewWrapper', transforms=train_transforms),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(
type='Pack3DDetInputs', keys=[
'img',
'gt_bboxes_3d',
'gt_labels_3d',
]),
]
test_transforms = [
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(1., 1.),
keep_ratio=True)
]
test_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(type='MultiViewWrapper', transforms=test_transforms),
dict(type='Pack3DDetInputs', keys=['img'])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(type='MultiViewWrapper', transforms=test_transforms),
dict(type='Pack3DDetInputs', keys=['img'])
]
metainfo = dict(CLASSES=class_names)
train_dataloader = dict(
batch_size=2,
num_workers=2,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_train.pkl',
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4',
),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
metainfo=metainfo,
box_type_3d='Lidar',
load_interval=5,
))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_val.pkl',
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4',
),
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
box_type_3d='Lidar',
))
test_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_val.pkl',
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4',
),
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
box_type_3d='Lidar',
))
val_evaluator = dict(
type='WaymoMetric',
ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
data_root='./data/waymo/waymo_format',
metric='LET_mAP')
test_evaluator = val_evaluator
model = dict(
type='MultiViewDfM',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32),
backbone=dict(
type='mmdet.ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True)),
neck=dict(
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=64,
num_outs=4),
neck_2d=None,
bbox_head_2d=None,
backbone_stereo=None,
depth_head=None,
backbone_3d=None,
neck_3d=dict(type='OutdoorImVoxelNeck', in_channels=64, out_channels=256),
valid_sample=True,
    voxel_size=(0.5, 0.5, 0.5),  # n_voxels=[220, 300, 12]
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-35.0, -75.0, -2, 75.0, 75.0, 4]],
rotations=[.0]),
bbox_head=dict(
type='Anchor3DHead',
num_classes=3,
in_channels=256,
feat_channels=256,
use_direction_classifier=True,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-35.0, -75.0, -0.0345, 75.0, 75.0, -0.0345],
[-35.0, -75.0, 0, 75.0, 75.0, 0],
[-35.0, -75.0, -0.1188, 75.0, 75.0, -0.1188]],
sizes=[
[4.73, 2.08, 1.77], # car
[0.91, 0.84, 1.74], # pedestrian
[1.81, 0.84, 1.77], # cyclist
],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
dir_offset=-0.7854, # -pi / 4
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False,
loss_weight=0.2)),
train_cfg=dict(
assigner=[
dict( # for Car
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
dict( # for Pedestrian
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Cyclist
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
],
allowed_border=0,
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.05,
score_thr=0.001,
min_bbox_size=0,
nms_pre=500,
max_num=100))
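
The voxel grid noted in the comment next to `voxel_size` follows directly from the anchor range and the voxel size; the same arithmetic appears in `MultiViewDfM.__init__` further down in this diff. A quick standalone check:

# Standalone check of the BEV voxel grid implied by the model config above.
voxel_size = (0.5, 0.5, 0.5)
voxel_range = [-35.0, -75.0, -2, 75.0, 75.0, 4]  # [x_min, y_min, z_min, x_max, y_max, z_max]

n_voxels = [
    round((voxel_range[3] - voxel_range[0]) / voxel_size[0]),  # x: 220
    round((voxel_range[4] - voxel_range[1]) / voxel_size[1]),  # y: 300
    round((voxel_range[5] - voxel_range[2]) / voxel_size[2]),  # z: 12
]
print(n_voxels)  # [220, 300, 12]
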
_base_ = [
'../_base_/datasets/waymoD5-mv3d-3class.py',
'../_base_/models/multiview_dfm.py'
]
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=0.0005, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=24,
by_epoch=True,
milestones=[16, 22],
gamma=0.1)
]
# hooks
default_hooks = dict(
timer=dict(type='IterTimerHook'),
logger=dict(type='LoggerHook', interval=50),
param_scheduler=dict(type='ParamSchedulerHook'),
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1),
sampler_seed=dict(type='DistSamplerSeedHook'),
)
# training schedule for 2x
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
# runtime
default_scope = 'mmdet3d'
env_cfg = dict(
cudnn_benchmark=False,
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
dist_cfg=dict(backend='nccl'),
)
log_level = 'INFO'
load_from = None
resume = False
find_unused_parameters = True # only 1 of 4 FPN outputs is used
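
With the dataset, model, schedule and runtime pieces above merged via `_base_`, training is typically launched through mmengine's Runner. A minimal sketch, assuming the merged config is saved under the hypothetical location below, a local work directory, and that the Waymo kitti_format data has already been prepared:

# Minimal training sketch via mmengine; config location and work_dir are assumptions.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'configs/dfm/multiview-dfm_r101_dcn_2x16_waymoD5-3d-3class.py')  # hypothetical location
cfg.work_dir = './work_dirs/multiview_dfm_waymoD5'  # Runner requires a work_dir

runner = Runner.from_cfg(cfg)
runner.train()
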
_base_ = ['./multiview-dfm_r101_dcn_2x16_waymoD5-3d-3class.py']
model = dict(
bbox_head=dict(
_delete_=True,
type='CenterHead',
in_channels=256,
tasks=[
dict(num_class=1, class_names=['Pedestrian']),
dict(num_class=1, class_names=['Cyclist']),
dict(num_class=1, class_names=['Car']),
],
common_heads=dict(reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2)),
share_conv_channel=64,
bbox_coder=dict(
type='CenterPointBBoxCoder',
post_center_range=[-35.0, -75.0, -2, 75.0, 75.0, 4],
pc_range=[-35.0, -75.0, -2, 75.0, 75.0, 4],
max_num=2000,
score_threshold=0,
out_size_factor=1,
voxel_size=(.50, .50),
code_size=7),
separate_head=dict(
type='SeparateHead', init_bias=-2.19, final_kernel=3),
loss_cls=dict(type='mmdet.GaussianFocalLoss', reduction='mean'),
loss_bbox=dict(
type='mmdet.L1Loss', reduction='mean', loss_weight=0.25),
norm_bbox=True),
train_cfg=dict(
_delete_=True,
grid_size=[220, 300, 1],
voxel_size=(0.5, 0.5, 6),
out_size_factor=1,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
point_cloud_range=[-35.0, -75.0, -2, 75.0, 75.0, 4]),
test_cfg=dict(
_delete_=True,
post_center_limit_range=[-35.0, -75.0, -2, 75.0, 75.0, 4],
max_per_img=4096,
max_pool_nms=False,
min_radius=[0.5, 2, 6],
score_threshold=0,
out_size_factor=1,
voxel_size=(0.5, 0.5),
nms_type='circle',
pre_max_size=2000,
post_max_size=200,
nms_thr=0.2))
...@@ -11,8 +11,9 @@ from .test_time_aug import MultiScaleFlipAug3D
from .transforms_3d import (AffineResize, BackgroundPointsFilter,
                            GlobalAlignment, GlobalRotScaleTrans,
                            IndoorPatchPointSample, IndoorPointSample,
                            MultiViewWrapper, ObjectNameFilter, ObjectNoise,
                            ObjectRangeFilter, ObjectSample,
                            PhotoMetricDistortion3D, PointSample, PointShuffle,
                            PointsRangeFilter, RandomDropPointsColor,
                            RandomFlip3D, RandomJitterPoints, RandomResize3D,
                            RandomShiftScale, Resize3D, VoxelBasedPointSampler)
...@@ -29,5 +30,6 @@ __all__ = [
    'VoxelBasedPointSampler', 'GlobalAlignment', 'IndoorPatchPointSample',
    'LoadImageFromFileMono3D', 'ObjectNameFilter', 'RandomDropPointsColor',
    'RandomJitterPoints', 'AffineResize', 'RandomShiftScale',
    'LoadPointsFromDict', 'Resize3D', 'RandomResize3D',
    'MultiViewWrapper', 'PhotoMetricDistortion3D'
]
...@@ -63,7 +63,6 @@ class Pack3DDetInputs(BaseTransform):
    def __init__(
        self,
        keys: tuple,
        meta_keys: tuple = ('img_path', 'ori_shape', 'img_shape', 'lidar2img',
                            'depth2img', 'cam2img', 'pad_shape',
                            'scale_factor', 'flip', 'pcd_horizontal_flip',
...@@ -72,8 +71,10 @@ class Pack3DDetInputs(BaseTransform):
                            'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
                            'pcd_rotation_angle', 'lidar_path',
                            'transformation_3d_flow', 'trans_mat',
                            'affine_aug', 'sweep_img_metas', 'ori_cam2img',
                            'cam2global', 'crop_offset', 'img_crop_offset',
                            'resize_img_shape', 'lidar2cam', 'ori_lidar2img',
                            'num_ref_frames', 'num_views', 'ego2global')) -> None:
        self.keys = keys
        self.meta_keys = meta_keys
...
# Copyright (c) OpenMMLab. All rights reserved.
import copy
from typing import List, Optional, Union

import mmcv
import mmengine
...@@ -23,15 +24,38 @@ class LoadMultiViewImageFromFiles(BaseTransform):
            Defaults to False.
        color_type (str, optional): Color type of the file.
            Defaults to 'unchanged'.
        file_client_args (dict): Config dict of file clients, refer to
            https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
            for more details. Defaults to dict(backend='disk').
        num_views (int): Number of views in a frame. Defaults to 5.
        num_ref_frames (int): Number of previous frames to load.
            Defaults to -1.
        test_mode (bool): Whether it is test mode during loading.
            Defaults to False.
        set_default_scale (bool): Whether to set the default scale.
            Defaults to True.
    """

    def __init__(self,
                 to_float32: bool = False,
                 color_type: str = 'unchanged',
                 file_client_args: dict = dict(backend='disk'),
                 num_views: int = 5,
                 num_ref_frames: int = -1,
                 test_mode: bool = False,
                 set_default_scale: bool = True) -> None:
        self.to_float32 = to_float32
        self.color_type = color_type
        self.file_client_args = file_client_args.copy()
        self.file_client = None
        self.num_views = num_views
        # num_ref_frames is used for multi-sweep loading
        self.num_ref_frames = num_ref_frames
        # when test_mode=False, we randomly select previous frames
        # otherwise, select the earliest one
        self.test_mode = test_mode
        self.set_default_scale = set_default_scale

    def transform(self, results: dict) -> Optional[dict]:
        """Call function to load multi-view images from files.
        Args:
...@@ -49,33 +73,151 @@ class LoadMultiViewImageFromFiles(BaseTransform):
            - scale_factor (float): Scale factor.
            - img_norm_cfg (dict): Normalization configuration of images.
        """
        # TODO: consider splitting the multi-sweep part out of this pipeline
# Derive the mask and transform for loading of multi-sweep data
if self.num_ref_frames > 0:
# init choice with the current frame
init_choice = np.array([0], dtype=np.int64)
num_frames = len(results['img_filename']) // self.num_views - 1
if num_frames == 0: # no previous frame, then copy cur frames
choices = np.random.choice(
1, self.num_ref_frames, replace=True)
elif num_frames >= self.num_ref_frames:
# NOTE: suppose the info is saved following the order
# from latest to earlier frames
if self.test_mode:
choices = np.arange(num_frames - self.num_ref_frames,
num_frames) + 1
# NOTE: +1 is for selecting previous frames
else:
choices = np.random.choice(
num_frames, self.num_ref_frames, replace=False) + 1
elif num_frames > 0 and num_frames < self.num_ref_frames:
if self.test_mode:
base_choices = np.arange(num_frames) + 1
random_choices = np.random.choice(
num_frames,
self.num_ref_frames - num_frames,
replace=True) + 1
choices = np.concatenate([base_choices, random_choices])
else:
choices = np.random.choice(
num_frames, self.num_ref_frames, replace=True) + 1
else:
raise NotImplementedError
choices = np.concatenate([init_choice, choices])
select_filename = []
for choice in choices:
select_filename += results['img_filename'][choice *
self.num_views:
(choice + 1) *
self.num_views]
results['img_filename'] = select_filename
for key in ['cam2img', 'lidar2cam']:
if key in results:
select_results = []
for choice in choices:
select_results += results[key][choice *
self.num_views:(choice +
1) *
self.num_views]
results[key] = select_results
for key in ['ego2global']:
if key in results:
select_results = []
for choice in choices:
select_results += [results[key][choice]]
results[key] = select_results
# Transform lidar2cam to
# [cur_lidar]2[prev_img] and [cur_lidar]2[prev_cam]
for key in ['lidar2cam']:
if key in results:
# only change matrices of previous frames
for choice_idx in range(1, len(choices)):
pad_prev_ego2global = np.eye(4)
prev_ego2global = results['ego2global'][choice_idx]
pad_prev_ego2global[:prev_ego2global.
shape[0], :prev_ego2global.
shape[1]] = prev_ego2global
pad_cur_ego2global = np.eye(4)
cur_ego2global = results['ego2global'][0]
pad_cur_ego2global[:cur_ego2global.
shape[0], :cur_ego2global.
shape[1]] = cur_ego2global
cur2prev = np.linalg.inv(pad_prev_ego2global).dot(
pad_cur_ego2global)
for result_idx in range(choice_idx * self.num_views,
(choice_idx + 1) *
self.num_views):
results[key][result_idx] = \
results[key][result_idx].dot(cur2prev)
# Support multi-view images with different shapes
# TODO: record the origin shape and padded shape
filename, cam2img, lidar2cam = [], [], []
for _, cam_item in results['images'].items():
filename.append(cam_item['img_path'])
cam2img.append(cam_item['cam2img'])
lidar2cam.append(cam_item['lidar2cam'])
results['filename'] = filename
results['cam2img'] = cam2img
results['lidar2cam'] = lidar2cam
results['ori_cam2img'] = copy.deepcopy(results['cam2img'])
if self.file_client is None:
self.file_client = mmcv.FileClient(**self.file_client_args)
        # img is of shape (h, w, c, num_views)
        # h and w can be different for different views
        img_bytes = [self.file_client.get(name) for name in filename]
imgs = [
mmcv.imfrombytes(img_byte, flag=self.color_type)
for img_byte in img_bytes
]
# handle the image with different shape
img_shapes = np.stack([img.shape for img in imgs], axis=0)
img_shape_max = np.max(img_shapes, axis=0)
img_shape_min = np.min(img_shapes, axis=0)
assert img_shape_min[-1] == img_shape_max[-1]
if not np.all(img_shape_max == img_shape_min):
pad_shape = img_shape_max[:2]
else:
pad_shape = None
if pad_shape is not None:
imgs = [
mmcv.impad(img, shape=pad_shape, pad_val=0) for img in imgs
]
img = np.stack(imgs, axis=-1)
        if self.to_float32:
            img = img.astype(np.float32)

        results['filename'] = filename
        # unravel to list, see `DefaultFormatBundle` in formatting.py
        # which will transpose each image separately and then stack into array
        results['img'] = [img[..., i] for i in range(img.shape[-1])]
        results['img_shape'] = img.shape
        results['ori_shape'] = img.shape
        # Set initial values for default meta_keys
        results['pad_shape'] = img.shape
        if self.set_default_scale:
            results['scale_factor'] = 1.0
        num_channels = 1 if len(img.shape) < 3 else img.shape[2]
        results['img_norm_cfg'] = dict(
            mean=np.zeros(num_channels, dtype=np.float32),
            std=np.ones(num_channels, dtype=np.float32),
            to_rgb=False)
results['num_views'] = self.num_views
results['num_ref_frames'] = self.num_ref_frames
        return results

    def __repr__(self):
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(to_float32={self.to_float32}, '
        repr_str += f"color_type='{self.color_type}', "
repr_str += f'num_views={self.num_views}, '
repr_str += f'num_ref_frames={self.num_ref_frames}, '
repr_str += f'test_mode={self.test_mode})'
        return repr_str
...
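
The reference-frame sampling performed by the new multi-sweep branch above can be reproduced in isolation. A simplified standalone sketch of the selection logic (the helper name is illustrative, not part of the PR), under the same assumption that infos are stored from latest to earliest; the num_frames == 0 and the test-mode padding edge cases follow the diff above:

# Simplified sketch of the previous-frame selection used in the loading transform.
import numpy as np

def select_ref_frames(num_frames, num_ref_frames, test_mode):
    """Pick 1-based indices of previous frames (index 0 is the current frame)."""
    if num_frames >= num_ref_frames:
        if test_mode:
            # deterministic: take the earliest stored frames
            return np.arange(num_frames - num_ref_frames, num_frames) + 1
        # training: random subset of previous frames
        return np.random.choice(num_frames, num_ref_frames, replace=False) + 1
    # fewer previous frames than requested: sample with replacement
    return np.random.choice(num_frames, num_ref_frames, replace=True) + 1

print(select_ref_frames(num_frames=4, num_ref_frames=2, test_mode=True))  # [3 4]
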
...@@ -140,10 +140,10 @@ class WaymoDataset(KittiDataset):
        # convert gt_bboxes_3d to velodyne coordinates with `lidar2cam`
        if 'gt_bboxes' in ann_info:
            gt_bboxes = ann_info['gt_bboxes']
            gt_bboxes_labels = ann_info['gt_bboxes_labels']
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_bboxes_labels = np.zeros(0, dtype=np.int64)
        if 'centers_2d' in ann_info:
            centers_2d = ann_info['centers_2d']
            depths = ann_info['depths']
...@@ -169,7 +169,7 @@ class WaymoDataset(KittiDataset):
            gt_bboxes_3d=gt_bboxes_3d,
            gt_labels_3d=ann_info['gt_labels_3d'],
            gt_bboxes=gt_bboxes,
            gt_bboxes_labels=gt_bboxes_labels,
            centers_2d=centers_2d,
            depths=depths)
...
...@@ -66,7 +66,8 @@ class KittiMetric(BaseMetric):
        self.default_cam_key = default_cam_key
        self.file_client_args = file_client_args
        self.default_cam_key = default_cam_key

        allowed_metrics = ['bbox', 'img_bbox', 'mAP', 'LET_mAP']
        self.metrics = metric if isinstance(metric, list) else [metric]
        for metric in self.metrics:
            if metric not in allowed_metrics:
...
This diff is collapsed.
...@@ -14,6 +14,8 @@ from mmdet3d.registry import MODELS
from mmdet3d.utils import OptConfigType
from mmdet.models import DetDataPreprocessor
from mmdet.models.utils.misc import samplelist_boxtype2tensor

from .utils import multiview_img_stack_batch


@MODELS.register_module()
...@@ -144,7 +146,6 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
        data = self.collate_data(data)
        inputs, data_samples = data['inputs'], data['data_samples']
        batch_inputs = dict()

        if 'points' in inputs:
...@@ -185,6 +186,23 @@ class Det3DDataPreprocessor(DetDataPreprocessor):

        return {'inputs': batch_inputs, 'data_samples': data_samples}
def preprocess_img(self, _batch_img):
# channel transform
if self._channel_conversion:
_batch_img = _batch_img[[2, 1, 0], ...]
# Convert to float after channel conversion to ensure
# efficiency
_batch_img = _batch_img.float()
# Normalization.
if self._enable_normalize:
if self.mean.shape[0] == 3:
assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3, (
'If the mean has 3 values, the input tensor '
'should in shape of (3, H, W), but got the '
f'tensor with shape {_batch_img.shape}')
_batch_img = (_batch_img - self.mean) / self.std
return _batch_img
    def collate_data(self, data: dict) -> dict:
        """Copy data to the target device and perform normalization,
        padding and bgr2rgb conversion and stacking based on
...@@ -203,30 +221,30 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
        if 'img' in data['inputs']:
            _batch_imgs = data['inputs']['img']

            # Process data with `pseudo_collate`.
            if is_list_of(_batch_imgs, torch.Tensor):
                batch_imgs = []
                img_dim = _batch_imgs[0].dim()
                for _batch_img in _batch_imgs:
                    if img_dim == 3:  # standard img
                        _batch_img = self.preprocess_img(_batch_img)
                    elif img_dim == 4:
                        _batch_img = [
                            self.preprocess_img(_img) for _img in _batch_img
                        ]

                        _batch_img = torch.stack(_batch_img, dim=0)

                    batch_imgs.append(_batch_img)

                # Pad and stack Tensor.
                if img_dim == 3:
                    batch_imgs = stack_batch(batch_imgs, self.pad_size_divisor,
                                             self.pad_value)
                elif img_dim == 4:
                    batch_imgs = multiview_img_stack_batch(
                        batch_imgs, self.pad_size_divisor, self.pad_value)

            # Process data with `default_collate`.
            elif isinstance(_batch_imgs, torch.Tensor):
                assert _batch_imgs.dim() == 4, (
...@@ -270,6 +288,10 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
        if is_list_of(_batch_inputs, torch.Tensor):
            batch_pad_shape = []
            for ori_input in _batch_inputs:
                if ori_input.dim() == 4:
                    # multi-view input: select one of the images
                    # to calculate the pad shape
                    ori_input = ori_input[0]
                pad_h = int(
                    np.ceil(ori_input.shape[1] /
                            self.pad_size_divisor)) * self.pad_size_divisor
...
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union
import torch
import torch.nn.functional as F
def multiview_img_stack_batch(
tensor_list: List[torch.Tensor],
pad_size_divisor: int = 1,
pad_value: Union[int, float] = 0) -> torch.Tensor:
"""
Compared to the stack_batch in mmengine.model.utils,
multiview_img_stack_batch further handle the multiview images.
see diff of padded_sizes[:, :-2] = 0 vs padded_sizees[:, 0] = 0 in line 47
Stack multiple tensors to form a batch and pad the tensor to the max
shape use the right bottom padding mode in these images. If
``pad_size_divisor > 0``, add padding to ensure the shape of each dim is
divisible by ``pad_size_divisor``.
Args:
tensor_list (List[Tensor]): A list of tensors with the same dim.
pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding
to ensure the shape of each dim is divisible by
``pad_size_divisor``. This depends on the model, and many
models need to be divisible by 32. Defaults to 1
pad_value (int, float): The padding value. Defaults to 0.
Returns:
Tensor: The n dim tensor.
"""
assert isinstance(
tensor_list,
list), (f'Expected input type to be list, but got {type(tensor_list)}')
assert tensor_list, '`tensor_list` could not be an empty list'
assert len({
tensor.ndim
for tensor in tensor_list
}) == 1, (f'Expected the dimensions of all tensors must be the same, '
f'but got {[tensor.ndim for tensor in tensor_list]}')
dim = tensor_list[0].dim()
num_img = len(tensor_list)
all_sizes: torch.Tensor = torch.Tensor(
[tensor.shape for tensor in tensor_list])
max_sizes = torch.ceil(
torch.max(all_sizes, dim=0)[0] / pad_size_divisor) * pad_size_divisor
padded_sizes = max_sizes - all_sizes
# The first dim normally means channel, which should not be padded.
padded_sizes[:, :-2] = 0
if padded_sizes.sum() == 0:
return torch.stack(tensor_list)
# `pad` is the second arguments of `F.pad`. If pad is (1, 2, 3, 4),
# it means that padding the last dim with 1(left) 2(right), padding the
# penultimate dim to 3(top) 4(bottom). The order of `pad` is opposite of
# the `padded_sizes`. Therefore, the `padded_sizes` needs to be reversed,
# and only odd index of pad should be assigned to keep padding "right" and
# "bottom".
pad = torch.zeros(num_img, 2 * dim, dtype=torch.int)
pad[:, 1::2] = padded_sizes[:, range(dim - 1, -1, -1)]
batch_tensor = []
for idx, tensor in enumerate(tensor_list):
batch_tensor.append(
F.pad(tensor, tuple(pad[idx].tolist()), value=pad_value))
return torch.stack(batch_tensor)
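
A usage sketch for the helper above: two multi-view image tensors with different spatial sizes are padded to a shared, divisor-aligned shape and stacked into a (B, N, C, H, W) batch. The import path is an assumption inferred from the `from .utils import multiview_img_stack_batch` line in the data preprocessor diff:

# Usage sketch for multiview_img_stack_batch (module path is an assumption).
import torch
from mmdet3d.models.data_preprocessors.utils import multiview_img_stack_batch

views_a = torch.rand(5, 3, 480, 640)  # 5 camera views, 480x640
views_b = torch.rand(5, 3, 458, 620)  # same cameras, smaller crop

batch = multiview_img_stack_batch([views_a, views_b],
                                  pad_size_divisor=32, pad_value=0)
print(batch.shape)  # torch.Size([2, 5, 3, 480, 640]) -- only H/W are padded
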
# Copyright (c) OpenMMLab. All rights reserved.
from .base import Base3DDetector
from .centerpoint import CenterPoint
from .dfm import DfM
from .dynamic_voxelnet import DynamicVoxelNet
from .fcos_mono3d import FCOSMono3D
from .groupfree3dnet import GroupFree3DNet
from .h3dnet import H3DNet
from .imvotenet import ImVoteNet
from .imvoxelnet import ImVoxelNet
from .multiview_dfm import MultiViewDfM
from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN
from .mvx_two_stage import MVXTwoStageDetector
from .parta2 import PartA2
...@@ -19,9 +21,25 @@ from .votenet import VoteNet
from .voxelnet import VoxelNet

__all__ = [
    'Base3DDetector',
    'DfM',
    'VoxelNet',
    'DynamicVoxelNet',
    'MVXTwoStageDetector',
    'DynamicMVXFasterRCNN',
    'MVXFasterRCNN',
    'MultiViewDfM',
    'PartA2',
    'VoteNet',
    'H3DNet',
    'CenterPoint',
    'SSD3DNet',
    'ImVoteNet',
    'SingleStageMono3DDetector',
    'FCOSMono3D',
    'ImVoxelNet',
    'GroupFree3DNet',
    'PointRCNN',
    'SMOKEMono3D',
    'SASSD',
]
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet3d.registry import MODELS
from mmdet3d.structures.ops import bbox3d2result
from mmdet3d.utils import ConfigType
from mmdet.models.detectors import BaseDetector
from ..builder import build_backbone, build_head, build_neck
@MODELS.register_module()
class DfM(BaseDetector):
r"""`Monocular 3D Object Detection with Depth from Motion.
<https://arxiv.org/abs/2207.12988>`_.
Args:
backbone (:obj:`ConfigDict` or dict): The backbone config.
neck (:obj:`ConfigDict` or dict): The neck config.
backbone_stereo (:obj:`ConfigDict` or dict): The stereo backbone
config.
backbone_3d (:obj:`ConfigDict` or dict): The 3d backbone config.
neck_3d (:obj:`ConfigDict` or dict): The 3D neck config.
bbox_head_3d (:obj:`ConfigDict` or dict): The 3d bbox head config.
neck_2d (:obj:`ConfigDict` or dict, optional): The 2D neck config
for 2D object detection. Defaults to None.
bbox_head_2d (:obj:`ConfigDict` or dict, optional): The 2D bbox
head config for 2D object detection. Defaults to None.
depth_head_2d (:obj:`ConfigDict` or dict, optional): The 2D depth
head config for depth estimation in fov space. Defaults to None.
        depth_head (:obj:`ConfigDict` or dict, optional): The depth head
            config for depth estimation in 3D voxel projected to fov space.
train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of
training hyper-parameters. Defaults to None.
test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test
hyper-parameters. Defaults to None.
        pretrained (:obj:`ConfigDict` or dict, optional): The pretrained
config.
init_cfg (:obj:`ConfigDict` or dict, optional): The initialization
config. Defaults to None.
"""
def __init__(self,
backbone: ConfigType,
neck: ConfigType,
backbone_stereo: ConfigType,
backbone_3d: ConfigType,
neck_3d: ConfigType,
bbox_head_3d: ConfigType,
neck_2d=None,
bbox_head_2d=None,
depth_head_2d=None,
depth_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.backbone = build_backbone(backbone)
self.neck = build_neck(neck)
if backbone_stereo is not None:
backbone_stereo.update(cat_img_feature=self.neck.cat_img_feature)
backbone_stereo.update(in_sem_channels=self.neck.sem_channels[-1])
self.backbone_stereo = build_backbone(backbone_stereo)
assert self.neck.cat_img_feature == \
self.backbone_stereo.cat_img_feature
assert self.neck.sem_channels[
-1] == self.backbone_stereo.in_sem_channels
if backbone_3d is not None:
self.backbone_3d = build_backbone(backbone_3d)
if neck_3d is not None:
self.neck_3d = build_neck(neck_3d)
if neck_2d is not None:
self.neck_2d = build_neck(neck_2d)
if bbox_head_2d is not None:
self.bbox_head_2d = build_head(bbox_head_2d)
if depth_head_2d is not None:
self.depth_head_2d = build_head(depth_head_2d)
if depth_head is not None:
self.depth_head = build_head(depth_head)
self.depth_samples = self.depth_head.depth_samples
self.train_cfg = train_cfg
self.test_cfg = test_cfg
bbox_head_3d.update(train_cfg=train_cfg)
bbox_head_3d.update(test_cfg=test_cfg)
self.bbox_head_3d = build_head(bbox_head_3d)
@property
def with_backbone_3d(self):
"""Whether the detector has a 3D backbone."""
return hasattr(self, 'backbone_3d') and self.backbone_3d is not None
@property
def with_neck_3d(self):
"""Whether the detector has a 3D neck."""
return hasattr(self, 'neck_3d') and self.neck_3d is not None
@property
def with_neck_2d(self):
"""Whether the detector has a 2D neck."""
return hasattr(self, 'neck_2d') and self.neck_2d is not None
@property
def with_bbox_head_2d(self):
"""Whether the detector has a 2D detection head."""
return hasattr(self, 'bbox_head_2d') and self.bbox_head_2d is not None
@property
def with_depth_head_2d(self):
"""Whether the detector has a image-based depth head."""
return hasattr(self,
'depth_head_2d') and self.depth_head_2d is not None
@property
def with_depth_head(self):
"""Whether the detector has a frustum-based depth head."""
return hasattr(self, 'depth_head') and self.depth_head is not None
def extract_feat(self, img, img_metas):
"""Feature extraction for perspective-view images.
Args:
img (torch.Tensor): Images of shape [B, N, C_in, H, W].
img_metas (list): Image meta information. Each element corresponds
to a group of images. len(img_metas) == B.
Returns:
torch.Tensor: bev feature with shape [B, C_out, N_y, N_x].
"""
# split input img into current and previous ones
batch_size, N, C_in, H, W = img.shape
cur_imgs = img[:, 0]
prev_imgs = img[:, 1] # TODO: to support multiple prev imgs
# 2D backbone for feature extraction
cur_feats = self.backbone(cur_imgs)
cur_feats = [cur_imgs] + list(cur_feats)
prev_feats = self.backbone(prev_imgs)
prev_feats = [prev_imgs] + list(prev_feats)
# SPP module as the feature neck
cur_stereo_feat, cur_sem_feat = self.neck(cur_feats)
prev_stereo_feat, prev_sem_feat = self.neck(prev_feats)
# derive cur2prevs
cur_pose = torch.tensor(
[img_meta['cam2global'] for img_meta in img_metas],
device=img.device)[:, None, :, :] # (B, 1, 4, 4)
prev_poses = []
for img_meta in img_metas:
sweep_img_metas = img_meta['sweep_img_metas']
prev_poses.append([
sweep_img_meta['cam2global']
for sweep_img_meta in sweep_img_metas
])
prev_poses = torch.tensor(prev_poses, device=img.device)
pad_prev_cam2global = torch.eye(4)[None, None].expand(
batch_size, N - 1, 4, 4).to(img.device)
pad_prev_cam2global[:, :, :prev_poses.shape[-2], :prev_poses.
shape[-1]] = prev_poses
pad_cur_cam2global = torch.eye(4)[None,
None].expand(batch_size, 1, 4,
4).to(img.device)
pad_cur_cam2global[:, :, :cur_pose.shape[-2], :cur_pose.
shape[-1]] = cur_pose
# (B, N-1, 4, 4) * (B, 1, 4, 4) -> (B, N-1, 4, 4)
# torch.linalg.solve is faster and more numerically stable
# than torch.matmul(torch.linalg.inv(A), B)
# empirical results show that torch.linalg.solve can derive
# almost the same result with np.linalg.inv
# while torch.linalg.inv can not
cur2prevs = torch.linalg.solve(pad_prev_cam2global, pad_cur_cam2global)
for meta_idx, img_meta in enumerate(img_metas):
img_meta['cur2prevs'] = cur2prevs[meta_idx]
# stereo backbone for depth estimation
# volume_feat: (batch_size, Cv, Nz, Ny, Nx)
volume_feat = self.backbone_stereo(cur_stereo_feat, prev_stereo_feat,
img_metas, cur_sem_feat)
# height compression
_, Cv, Nz, Ny, Nx = volume_feat.shape
bev_feat = volume_feat.view(batch_size, Cv * Nz, Ny, Nx)
bev_feat_prehg, bev_feat = self.neck_3d(bev_feat)
return bev_feat
def forward_train(self,
img,
img_metas,
gt_bboxes_3d,
gt_labels_3d,
depth_img=None,
**kwargs):
"""Forward function for training."""
bev_feat = self.extract_feat(img, img_metas)
outs = self.bbox_head_3d([bev_feat])
losses = self.bbox_head_3d.loss(*outs, gt_bboxes_3d, gt_labels_3d,
img_metas)
# TODO: loss_dense_depth, loss_2d, loss_imitation
return losses
def forward_test(self, img, img_metas, **kwargs):
"""Forward of testing.
Args:
img (torch.Tensor): Input images of shape (N, C_in, H, W).
img_metas (list): Image metas.
Returns:
list[dict]: Predicted 3d boxes.
"""
# not supporting aug_test for now
return self.simple_test(img, img_metas)
def simple_test(self, img, img_metas):
"""Simple inference forward without test time augmentation."""
bev_feat = self.extract_feat(img, img_metas)
# bbox_head takes a list of feature from different levels as input
# so need [bev_feat]
outs = self.bbox_head_3d([bev_feat])
bbox_list = self.bbox_head_3d.get_bboxes(*outs, img_metas)
bbox_results = [
bbox3d2result(det_bboxes, det_scores, det_labels)
for det_bboxes, det_scores, det_labels in bbox_list
]
# add pseudo-lidar label to each pred_dict for post-processing
for bbox_result in bbox_results:
bbox_result['pseudo_lidar'] = True
return bbox_results
def aug_test(self, imgs, img_metas, **kwargs):
"""Test with augmentations.
Args:
imgs (list[torch.Tensor]): Input images of shape (N, C_in, H, W).
img_metas (list): Image metas.
Returns:
list[dict]: Predicted 3d boxes.
"""
raise NotImplementedError
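
The `cur2prevs` computation in `extract_feat` above relies on `torch.linalg.solve(A, B)` returning `A^{-1} B`. A small, purely illustrative numerical check of that equivalence on padded 4x4 pose matrices (the `make_pose` helper and its values are made up for the example):

# Illustrative check that torch.linalg.solve(A, B) matches inv(A) @ B for poses.
import torch

def make_pose(yaw, t):
    """Build a 4x4 homogeneous pose from a yaw angle and a translation."""
    pose = torch.eye(4)
    c, s = torch.cos(torch.tensor(yaw)), torch.sin(torch.tensor(yaw))
    pose[0, 0], pose[0, 1], pose[1, 0], pose[1, 1] = c, -s, s, c
    pose[:3, 3] = torch.tensor(t)
    return pose

pad_prev_cam2global = make_pose(0.3, [1.0, 2.0, 0.5])
pad_cur_cam2global = make_pose(0.1, [1.5, 2.5, 0.5])

cur2prev_solve = torch.linalg.solve(pad_prev_cam2global, pad_cur_cam2global)
cur2prev_inv = torch.linalg.inv(pad_prev_cam2global) @ pad_cur_cam2global
print(torch.allclose(cur2prev_solve, cur2prev_inv, atol=1e-6))  # True
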
...@@ -2,12 +2,15 @@
from typing import List, Tuple, Union

import torch
from mmengine.structures import InstanceData

from mmdet3d.models.detectors import Base3DDetector
from mmdet3d.models.layers.fusion_layers.point_fusion import point_sample
from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures.det3d_data_sample import SampleList
from mmdet3d.utils import ConfigType, OptConfigType, OptInstanceList
from mmdet.models.detectors import BaseDetector


@MODELS.register_module()
...@@ -184,3 +187,64 @@ class ImVoxelNet(Base3DDetector):
        x = self.extract_feat(batch_inputs_dict, batch_data_samples)
        results = self.bbox_head.forward(x)
        return results
def convert_to_datasample(
self,
data_samples: SampleList,
data_instances_3d: OptInstanceList = None,
data_instances_2d: OptInstanceList = None,
) -> SampleList:
"""Convert results list to `Det3DDataSample`.
Subclasses could override it to be compatible for some multi-modality
3D detectors.
Args:
data_samples (list[:obj:`Det3DDataSample`]): The input data.
data_instances_3d (list[:obj:`InstanceData`], optional): 3D
Detection results of each sample.
data_instances_2d (list[:obj:`InstanceData`], optional): 2D
Detection results of each sample.
Returns:
list[:obj:`Det3DDataSample`]: Detection results of the
input. Each Det3DDataSample usually contains
'pred_instances_3d'. And the ``pred_instances_3d`` normally
contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instance, )
- labels_3d (Tensor): Labels of 3D bboxes, has a shape
(num_instances, ).
- bboxes_3d (Tensor): Contains a tensor with shape
(num_instances, C) where C >=7.
                When there are image predictions in some models, it should
                contain ``pred_instances``, and the ``pred_instances`` normally
                contains the following keys.
- scores (Tensor): Classification scores of image, has a shape
(num_instance, )
- labels (Tensor): Predict Labels of 2D bboxes, has a shape
(num_instances, ).
- bboxes (Tensor): Contains a tensor with shape
(num_instances, 4).
"""
assert (data_instances_2d is not None) or \
(data_instances_3d is not None),\
'please pass at least one type of data_samples'
if data_instances_2d is None:
data_instances_2d = [
InstanceData() for _ in range(len(data_instances_3d))
]
if data_instances_3d is None:
data_instances_3d = [
InstanceData() for _ in range(len(data_instances_2d))
]
for i, data_sample in enumerate(data_samples):
data_sample.pred_instances_3d = data_instances_3d[i]
data_sample.pred_instances = data_instances_2d[i]
return data_samples
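
The pairing performed by `convert_to_datasample` can be seen with mmengine's data structures directly. A minimal sketch that builds one `Det3DDataSample` and attaches a 3D prediction, mirroring the loop at the end of the method (all tensor values below are dummies):

# Minimal sketch of attaching InstanceData predictions to a Det3DDataSample.
import torch
from mmengine.structures import InstanceData
from mmdet3d.structures import Det3DDataSample

data_sample = Det3DDataSample()
pred_3d = InstanceData()
pred_3d.scores_3d = torch.tensor([0.9, 0.4])  # dummy classification scores
pred_3d.labels_3d = torch.tensor([0, 2])      # dummy class indices
pred_3d.bboxes_3d = torch.rand(2, 7)          # dummy (x, y, z, dx, dy, dz, yaw) boxes

data_sample.pred_instances_3d = pred_3d       # same assignment as in the loop above
data_sample.pred_instances = InstanceData()   # empty 2D predictions, as in the method
print(data_sample.pred_instances_3d.labels_3d)
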
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
from mmdet3d.models.layers.fusion_layers.point_fusion import (point_sample,
voxel_sample)
from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures.bbox_3d.utils import get_lidar2img
from mmdet3d.structures.det3d_data_sample import SampleList
from mmdet3d.utils import ConfigType, OptConfigType
from mmdet.models.detectors import BaseDetector
from .dfm import DfM
from .imvoxelnet import ImVoxelNet
@MODELS.register_module()
class MultiViewDfM(ImVoxelNet, DfM):
r"""Waymo challenge solution of `MV-FCOS3D++
<https://arxiv.org/abs/2207.12716>`_.
Args:
backbone (:obj:`ConfigDict` or dict): The backbone config.
neck (:obj:`ConfigDict` or dict): The neck config.
backbone_stereo (:obj:`ConfigDict` or dict): The stereo backbone
config.
backbone_3d (:obj:`ConfigDict` or dict): The 3d backbone config.
neck_3d (:obj:`ConfigDict` or dict): The 3D neck config.
bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
voxel_size (:obj:`ConfigDict` or dict): The voxel size.
anchor_generator (:obj:`ConfigDict` or dict): The anchor generator
config.
neck_2d (:obj:`ConfigDict` or dict, optional): The 2D neck config
for 2D object detection. Defaults to None.
bbox_head_2d (:obj:`ConfigDict` or dict, optional): The 2D bbox
head config for 2D object detection. Defaults to None.
depth_head_2d (:obj:`ConfigDict` or dict, optional): The 2D depth
head config for depth estimation in fov space. Defaults to None.
        depth_head (:obj:`ConfigDict` or dict, optional): The depth head
            config for depth estimation in 3D voxel projected to fov space.
train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of
training hyper-parameters. Defaults to None.
test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test
hyper-parameters. Defaults to None.
data_preprocessor (dict or ConfigDict, optional): The pre-process
            config of :class:`BaseDataPreprocessor`. It usually includes
``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
valid_sample (bool): Whether to filter invalid points in view
transformation. Defaults to True.
temporal_aggregate (str): Key to determine the aggregation way in
temporal fusion. Defaults to 'concat'.
transform_depth (bool): Key to determine the transformation of depth.
Defaults to True.
init_cfg (:obj:`ConfigDict` or dict, optional): The initialization
config. Defaults to None.
"""
def __init__(self,
backbone: ConfigType,
neck: ConfigType,
backbone_stereo: ConfigType,
backbone_3d: ConfigType,
neck_3d: ConfigType,
bbox_head: ConfigType,
voxel_size: ConfigType,
anchor_generator: ConfigType,
neck_2d: ConfigType = None,
bbox_head_2d: ConfigType = None,
depth_head_2d: ConfigType = None,
depth_head: ConfigType = None,
train_cfg: OptConfigType = None,
test_cfg: OptConfigType = None,
data_preprocessor: OptConfigType = None,
valid_sample: bool = True,
temporal_aggregate: str = 'concat',
transform_depth: bool = True,
init_cfg: OptConfigType = None):
# TODO merge with DFM
BaseDetector.__init__(
self, data_preprocessor=data_preprocessor, init_cfg=init_cfg)
self.backbone = MODELS.build(backbone)
self.neck = MODELS.build(neck)
if backbone_stereo is not None:
backbone_stereo.update(cat_img_feature=self.neck.cat_img_feature)
backbone_stereo.update(in_sem_channels=self.neck.sem_channels[-1])
self.backbone_stereo = MODELS.build(backbone_stereo)
assert self.neck.cat_img_feature == \
self.backbone_stereo.cat_img_feature
assert self.neck.sem_channels[
-1] == self.backbone_stereo.in_sem_channels
if backbone_3d is not None:
self.backbone_3d = MODELS.build(backbone_3d)
if neck_3d is not None:
self.neck_3d = MODELS.build(neck_3d)
if neck_2d is not None:
self.neck_2d = MODELS.build(neck_2d)
if bbox_head_2d is not None:
self.bbox_head_2d = MODELS.build(bbox_head_2d)
if depth_head_2d is not None:
self.depth_head_2d = MODELS.build(depth_head_2d)
if depth_head is not None:
self.depth_head = MODELS.build(depth_head)
self.depth_samples = self.depth_head.depth_samples
self.train_cfg = train_cfg
self.test_cfg = test_cfg
bbox_head.update(train_cfg=train_cfg)
bbox_head.update(test_cfg=test_cfg)
self.bbox_head = MODELS.build(bbox_head)
self.voxel_size = voxel_size
self.voxel_range = anchor_generator['ranges'][0]
self.n_voxels = [
round((self.voxel_range[3] - self.voxel_range[0]) /
self.voxel_size[0]),
round((self.voxel_range[4] - self.voxel_range[1]) /
self.voxel_size[1]),
round((self.voxel_range[5] - self.voxel_range[2]) /
self.voxel_size[2])
]
self.anchor_generator = TASK_UTILS.build(anchor_generator)
self.valid_sample = valid_sample
self.temporal_aggregate = temporal_aggregate
self.transform_depth = transform_depth
def extract_feat(self, batch_inputs_dict: dict,
batch_data_samples: SampleList):
"""Extract 3d features from the backbone -> fpn -> 3d projection.
Args:
batch_inputs_dict (dict): The model input dict which include
the 'imgs' key.
- imgs (torch.Tensor, optional): Image of each sample.
batch_data_samples (list[:obj:`DetDataSample`]): The batch
data samples. It usually includes information such
as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
Returns:
torch.Tensor: of shape (N, C_out, N_x, N_y, N_z)
"""
# TODO: Nt means the number of frames temporally
# num_views means the number of views of a frame
img = batch_inputs_dict['imgs']
batch_img_metas = [
data_samples.metainfo for data_samples in batch_data_samples
]
batch_size, _, C_in, H, W = img.shape
num_views = batch_img_metas[0]['num_views']
num_ref_frames = batch_img_metas[0]['num_ref_frames']
if num_ref_frames > 0:
num_frames = num_ref_frames + 1
else:
num_frames = 1
input_shape = img.shape[-2:]
# NOTE: input_shape is the largest pad_shape of the batch of images
for img_meta in batch_img_metas:
img_meta.update(input_shape=input_shape)
if num_ref_frames > 0:
cur_imgs = img[:, :num_views].reshape(-1, C_in, H, W)
prev_imgs = img[:, num_views:].reshape(-1, C_in, H, W)
cur_feats = self.backbone(cur_imgs)
cur_feats = self.neck(cur_feats)[0]
with torch.no_grad():
prev_feats = self.backbone(prev_imgs)
prev_feats = self.neck(prev_feats)[0]
_, C_feat, H_feat, W_feat = cur_feats.shape
cur_feats = cur_feats.view(batch_size, -1, C_feat, H_feat, W_feat)
prev_feats = prev_feats.view(batch_size, -1, C_feat, H_feat,
W_feat)
batch_feats = torch.cat([cur_feats, prev_feats], dim=1)
else:
batch_imgs = img.view(-1, C_in, H, W)
batch_feats = self.backbone(batch_imgs)
# TODO: support SPP module neck
batch_feats = self.neck(batch_feats)[0]
_, C_feat, H_feat, W_feat = batch_feats.shape
batch_feats = batch_feats.view(batch_size, -1, C_feat, H_feat,
W_feat)
# transform the feature to voxel & stereo space
transform_feats = self.feature_transformation(batch_feats,
batch_img_metas,
num_views, num_frames)
if self.with_depth_head_2d:
transform_feats += (batch_feats[:, :num_views], )
return transform_feats
def feature_transformation(self, batch_feats, batch_img_metas, num_views,
num_frames):
"""Feature transformation from perspective view to BEV.
Args:
batch_feats (torch.Tensor): Perspective view features of shape
(batch_size, num_views, C, H, W).
batch_img_metas (list[dict]): Image meta information. Each element
corresponds to a group of images. len(img_metas) == B.
num_views (int): Number of views.
num_frames (int): Number of consecutive frames.
Returns:
tuple[torch.Tensor]: Volume features and (optionally) stereo \
features.
"""
# TODO: support more complicated 2D feature sampling
points = self.anchor_generator.grid_anchors(
[self.n_voxels[::-1]], device=batch_feats.device)[0][:, :3]
volumes = []
img_scale_factors = []
img_flips = []
img_crop_offsets = []
for feature, img_meta in zip(batch_feats, batch_img_metas):
# TODO: remove feature sampling from back
# TODO: support different scale_factors/flip/crop_offset for
# different views
frame_volume = []
frame_valid_nums = []
for frame_idx in range(num_frames):
volume = []
valid_flags = []
if isinstance(img_meta['img_shape'], list):
img_shape = img_meta['img_shape'][frame_idx][:2]
else:
img_shape = img_meta['img_shape'][:2]
for view_idx in range(num_views):
sample_idx = frame_idx * num_views + view_idx
if 'scale_factor' in img_meta:
img_scale_factor = img_meta['scale_factor'][sample_idx]
if isinstance(img_scale_factor, np.ndarray) and \
len(img_meta['scale_factor']) >= 2:
img_scale_factor = (
points.new_tensor(img_scale_factor[:2]))
else:
img_scale_factor = (
points.new_tensor(img_scale_factor))
else:
img_scale_factor = (1)
img_flip = img_meta['flip'][sample_idx] \
if 'flip' in img_meta.keys() else False
img_crop_offset = (
points.new_tensor(
img_meta['img_crop_offset'][sample_idx])
if 'img_crop_offset' in img_meta.keys() else 0)
lidar2cam = points.new_tensor(
img_meta['lidar2cam'][sample_idx])
cam2img = points.new_tensor(
img_meta['ori_cam2img'][sample_idx])
# align the precision, the tensor is converted to float32
lidar2img = get_lidar2img(cam2img.double(),
lidar2cam.double())
lidar2img = lidar2img.float()
sample_results = point_sample(
img_meta,
img_features=feature[sample_idx][None, ...],
points=points,
proj_mat=lidar2img,
coord_type='LIDAR',
img_scale_factor=img_scale_factor,
img_crop_offset=img_crop_offset,
img_flip=img_flip,
img_pad_shape=img_meta['input_shape'],
img_shape=img_shape,
aligned=False,
valid_flag=self.valid_sample)
if self.valid_sample:
volume.append(sample_results[0])
valid_flags.append(sample_results[1])
else:
volume.append(sample_results)
# TODO: save valid flags, more reasonable feat fusion
if self.valid_sample:
valid_nums = torch.stack(
valid_flags, dim=0).sum(0) # (N, )
volume = torch.stack(volume, dim=0).sum(0)
valid_mask = valid_nums > 0
volume[~valid_mask] = 0
frame_valid_nums.append(valid_nums)
else:
volume = torch.stack(volume, dim=0).mean(0)
frame_volume.append(volume)
img_scale_factors.append(img_scale_factor)
img_flips.append(img_flip)
img_crop_offsets.append(img_crop_offset)
if self.valid_sample:
if self.temporal_aggregate == 'mean':
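# sum the per-frame sums, then divide by the total number of
# valid samples over all frames and views (mean fusion)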
frame_volume = torch.stack(frame_volume, dim=0).sum(0)
frame_valid_nums = torch.stack(
frame_valid_nums, dim=0).sum(0)
frame_valid_mask = frame_valid_nums > 0
frame_volume[~frame_valid_mask] = 0
frame_volume = frame_volume / torch.clamp(
frame_valid_nums[:, None], min=1)
elif self.temporal_aggregate == 'concat':
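# average over views within each frame, then concatenate the
# per-frame volumes along the channel dimension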
frame_valid_nums = torch.stack(frame_valid_nums, dim=1)
frame_volume = torch.stack(frame_volume, dim=1)
frame_valid_mask = frame_valid_nums > 0
frame_volume[~frame_valid_mask] = 0
frame_volume = (frame_volume / torch.clamp(
frame_valid_nums[:, :, None], min=1)).flatten(
start_dim=1, end_dim=2)
else:
frame_volume = torch.stack(frame_volume, dim=0).mean(0)
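# reshape the flat per-point features into a (n_z, n_y, n_x, C)
# volume and permute to (C, n_x, n_y, n_z)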
volumes.append(
frame_volume.reshape(self.n_voxels[::-1] + [-1]).permute(
3, 2, 1, 0))
volume_feat = torch.stack(volumes) # (B, C, N_x, N_y, N_z)
if self.with_backbone_3d:
outputs = self.backbone_3d(volume_feat)
volume_feat = outputs[0]
if self.backbone_3d.output_bev:
# use outputs[0] if len(outputs) == 1
# use outputs[1] if len(outputs) == 2
# TODO: unify the output formats
bev_feat = outputs[-1]
# grid_sample stereo features from the volume feature
# TODO: also support temporal modeling for depth head
if self.with_depth_head:
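# for each view, back-project the volume along camera rays at the
# configured depth bins to build plane-sweep (stereo) features
# of shape (1, C, D, H, W)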
batch_stereo_feats = []
for batch_idx in range(volume_feat.shape[0]):
stereo_feat = []
for view_idx in range(num_views):
img_scale_factor = img_scale_factors[batch_idx] \
if self.transform_depth else points.new_tensor(
[1., 1.])
img_crop_offset = img_crop_offsets[batch_idx] \
if self.transform_depth else points.new_tensor(
[0., 0.])
img_flip = img_flips[batch_idx] if self.transform_depth \
else False
img_pad_shape = batch_img_metas[batch_idx]['input_shape'] \
if self.transform_depth else \
batch_img_metas[batch_idx]['ori_shape'][:2]
lidar2cam = points.new_tensor(
batch_img_metas[batch_idx]['lidar2cam'][view_idx])
cam2img = points.new_tensor(
batch_img_metas[batch_idx]['ori_cam2img'][view_idx])
proj_mat = torch.matmul(cam2img, lidar2cam)
stereo_feat.append(
voxel_sample(
volume_feat[batch_idx][None],
voxel_range=self.voxel_range,
voxel_size=self.voxel_size,
depth_samples=volume_feat.new_tensor(
self.depth_samples),
proj_mat=proj_mat,
downsample_factor=self.depth_head.
downsample_factor,
img_scale_factor=img_scale_factor,
img_crop_offset=img_crop_offset,
img_flip=img_flip,
img_pad_shape=img_pad_shape,
img_shape=batch_img_metas[batch_idx]['img_shape']
[view_idx][:2],
aligned=True)) # TODO: study the aligned setting
batch_stereo_feats.append(torch.cat(stereo_feat))
# cat (N, C, D, H, W) -> (B*N, C, D, H, W)
batch_stereo_feats = torch.cat(batch_stereo_feats)
if self.with_neck_3d:
if self.with_backbone_3d and self.backbone_3d.output_bev:
spatial_features = self.neck_3d(bev_feat)
# TODO: unify the outputs of neck_3d
volume_feat = spatial_features[1]
else:
volume_feat = self.neck_3d(volume_feat)[0]
# TODO: unify the output format of neck_3d
transform_feats = (volume_feat, )
if self.with_depth_head:
transform_feats += (batch_stereo_feats, )
return transform_feats
def aug_test(self, imgs, img_metas, **kwargs):
"""Test with augmentations.
Args:
imgs (list[torch.Tensor]): Input images of shape (N, C_in, H, W).
img_metas (list): Image metas.
Returns:
list[dict]: Predicted 3d boxes.
"""
raise NotImplementedError
@@ -7,7 +7,7 @@ from torch.nn import functional as F
 from mmdet3d.registry import MODELS
 from mmdet3d.structures.bbox_3d import (get_proj_mat_by_coord_type,
-                                         points_cam2img)
+                                         points_cam2img, points_img2cam)
 from . import apply_3d_transformation
@@ -23,7 +23,8 @@ def point_sample(img_meta,
                  img_shape,
                  aligned=True,
                  padding_mode='zeros',
-                 align_corners=True):
+                 align_corners=True,
+                 valid_flag=False):
     """Obtain image features using points.
     Args:
@@ -41,12 +42,15 @@ def point_sample(img_meta,
             padding, this is necessary to obtain features in feature map.
         img_shape (tuple[int]): int tuple indicates the h & w before padding
             after scaling, this is necessary for flipping coordinates.
-        aligned (bool, optional): Whether use bilinear interpolation when
+        aligned (bool): Whether to use bilinear interpolation when
             sampling image features for each point. Defaults to True.
-        padding_mode (str, optional): Padding mode when padding values for
+        padding_mode (str): Padding mode when padding values for
             features of out-of-image points. Defaults to 'zeros'.
-        align_corners (bool, optional): Whether to align corners when
+        align_corners (bool): Whether to align corners when
             sampling image features for each point. Defaults to True.
+        valid_flag (bool): Whether to filter out points that lie outside
+            the image or have a depth smaller than 0. Defaults to False.
     Returns:
         torch.Tensor: NxC image features sampled by point coordinates.
@@ -57,7 +61,12 @@ def point_sample(img_meta,
         points, coord_type, img_meta, reverse=True)
     # project points to image coordinate
-    pts_2d = points_cam2img(points, proj_mat)
+    if valid_flag:
+        proj_pts = points_cam2img(points, proj_mat, with_depth=True)
+        pts_2d = proj_pts[..., :2]
+        depths = proj_pts[..., 2]
+    else:
+        pts_2d = points_cam2img(points, proj_mat)
     # img transformation: scale -> crop -> flip
     # the image is resized by img_scale_factor
@@ -70,13 +79,13 @@ def point_sample(img_meta,
     if img_flip:
         # by default we take it as horizontal flip
         # use img_shape before padding for flip
-        orig_h, orig_w = img_shape
-        coor_x = orig_w - coor_x
+        ori_h, ori_w = img_shape
+        coor_x = ori_w - coor_x
     h, w = img_pad_shape
-    coor_y = coor_y / h * 2 - 1
-    coor_x = coor_x / w * 2 - 1
-    grid = torch.cat([coor_x, coor_y],
+    norm_coor_y = coor_y / h * 2 - 1
+    norm_coor_x = coor_x / w * 2 - 1
+    grid = torch.cat([norm_coor_x, norm_coor_y],
                      dim=1).unsqueeze(0).unsqueeze(0)  # Nx2 -> 1x1xNx2
     # align_corner=True provides higher performance
@@ -88,6 +97,15 @@ def point_sample(img_meta,
         padding_mode=padding_mode,
         align_corners=align_corners)  # 1xCx1xN feats
+    if valid_flag:
+        # (N, ) mask of points inside the image with positive depth
+        valid = (coor_x.squeeze() < w) & (coor_x.squeeze() > 0) & (
+            coor_y.squeeze() < h) & (coor_y.squeeze() > 0) & (
+                depths > 0)
+        valid_features = point_features.squeeze().t()
+        valid_features[~valid] = 0
+        return valid_features, valid  # (N, C), (N, )
     return point_features.squeeze().t()
@@ -304,3 +322,94 @@ class PointFusion(BaseModule):
         align_corners=self.align_corners,
     )
     return img_pts
def voxel_sample(voxel_features,
voxel_range,
voxel_size,
depth_samples,
proj_mat,
downsample_factor,
img_scale_factor,
img_crop_offset,
img_flip,
img_pad_shape,
img_shape,
aligned=True,
padding_mode='zeros',
align_corners=True):
"""Obtain image features using points.
Args:
voxel_features (torch.Tensor): 1 x C x Nx x Ny x Nz voxel features.
voxel_range (list): The range of voxel features.
voxel_size (:obj:`ConfigDict` or dict): The voxel size of voxel
features.
depth_samples (torch.Tensor): N depth samples in LiDAR coordinates.
proj_mat (torch.Tensor): Original (pre-augmentation) lidar2img
projection matrix of the sampled view.
downsample_factor (int): The downsample factor in rescaling.
img_scale_factor (tuple[torch.Tensor]): Scale factor with shape of
(w_scale, h_scale).
img_crop_offset (tuple[torch.Tensor]): Crop offset used to crop
image during data augmentation with shape of (w_offset, h_offset).
img_flip (bool): Whether the image is flipped.
img_pad_shape (tuple[int]): int tuple indicates the h & w after
padding, this is necessary to obtain features in feature map.
img_shape (tuple[int]): int tuple indicates the h & w before padding
after scaling, this is necessary for flipping coordinates.
aligned (bool, optional): Whether use bilinear interpolation when
sampling image features for each point. Defaults to True.
padding_mode (str, optional): Padding mode when padding values for
features of out-of-image points. Defaults to 'zeros'.
align_corners (bool, optional): Whether to align corners when
sampling image features for each point. Defaults to True.
Returns:
torch.Tensor: 1xCxDxHxW frustum features sampled from voxel features.
"""
# construct frustum grid
device = voxel_features.device
h, w = img_pad_shape
h_out = round(h / downsample_factor)
w_out = round(w / downsample_factor)
ws = (torch.linspace(0, w_out - 1, w_out) * downsample_factor).to(device)
hs = (torch.linspace(0, h_out - 1, h_out) * downsample_factor).to(device)
depths = depth_samples[::downsample_factor]
num_depths = len(depths)
ds_3d, ys_3d, xs_3d = torch.meshgrid(depths, hs, ws)
# grid: (D, H_out, W_out, 3) -> (D*H_out*W_out, 3)
grid = torch.stack([xs_3d, ys_3d, ds_3d], dim=-1).view(-1, 3)
# recover the coordinates in the canonical space
# reverse order of augmentations: flip -> crop -> scale
if img_flip:
# by default we take it as horizontal flip
# use img_shape before padding for flip
ori_h, ori_w = img_shape
grid[:, 0] = ori_w - grid[:, 0]
grid[:, :2] += img_crop_offset
grid[:, :2] /= img_scale_factor
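# grid now holds (u, v, d): pixel coordinates in the original image
# paired with the sampled depth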
# grid3d: (D*H_out*W_out, 3) in LiDAR coordinate system
grid3d = points_img2cam(grid, proj_mat)
# convert the 3D point coordinates to voxel coordinates
voxel_range = torch.tensor(voxel_range).to(device).view(1, 6)
voxel_size = torch.tensor(voxel_size).to(device).view(1, 3)
# suppose the voxel grid is generated with AlignedAnchorGenerator
# -0.5 given each grid is located at the center of the grid
# TODO: study whether here needs -0.5
grid3d = (grid3d - voxel_range[:, :3]) / voxel_size - 0.5
grid_size = (voxel_range[:, 3:] - voxel_range[:, :3]) / voxel_size
# normalize grid3d to (-1, 1)
grid3d = grid3d / grid_size * 2 - 1
# (x, y, z) -> (z, y, x) for grid_sampling
grid3d = grid3d.view(1, num_depths, h_out, w_out, 3)[..., [2, 1, 0]]
# align_corner=True provides higher performance
mode = 'bilinear' if aligned else 'nearest'
frustum_features = F.grid_sample(
voxel_features,
grid3d,
mode=mode,
padding_mode=padding_mode,
align_corners=align_corners) # 1xCxDxHxW feats
return frustum_features
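# Illustrative usage sketch (editor's note, not part of the original file).
# All shapes, ranges and names below are invented for demonstration and
# must be replaced by the values coming from the actual config / img_metas.
#
#   volume = torch.rand(1, 64, 200, 200, 12)              # (1, C, Nx, Ny, Nz)
#   frustum = voxel_sample(
#       volume,
#       voxel_range=[-51.2, -51.2, -2.0, 51.2, 51.2, 4.0],  # hypothetical
#       voxel_size=[0.512, 0.512, 0.5],                      # hypothetical
#       depth_samples=torch.linspace(2.0, 59.6, 288),
#       proj_mat=lidar2img,          # (4, 4) original lidar2img matrix
#       downsample_factor=4,
#       img_scale_factor=volume.new_tensor([1., 1.]),
#       img_crop_offset=0,
#       img_flip=False,
#       img_pad_shape=(832, 1248),
#       img_shape=(832, 1248))
#   # frustum: (1, 64, 72, 208, 312) plane-sweep features for this view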