Unverified Commit 6c03a971 authored by Tai-Wang, committed by GitHub

Release v1.1.0rc1

parents 9611c2d0 ca42c312
# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.hooks import Hook
from mmdet3d.registry import HOOKS
@HOOKS.register_module()
class BenchmarkHook(Hook):
"""A hook that logs the training speed of each epch."""
priority = 'NORMAL'
def after_train_epoch(self, runner) -> None:
"""We use the average throughput in iterations of the entire training
run and skip the first 50 iterations of each epoch to skip GPU warmup
time.
Args:
runner (Runner): The runner of the training process.
"""
message_hub = runner.message_hub
max_iter_num = len(runner.train_dataloader)
speed = message_hub.get_scalar('train/time').mean(max_iter_num - 50)
message_hub.update_scalar('train/speed', speed)
runner.logger.info(
f'Training speed of epoch {runner.epoch + 1} is {speed} s/iter')
def after_train(self, runner) -> None:
"""Log average training speed of entire training process.
Args:
runner (Runner): The runner of the training process.
"""
message_hub = runner.message_hub
avg_speed = message_hub.get_scalar('train/speed').mean()
runner.logger.info('Average training speed of entire training process '
f'is {avg_speed} s/iter')
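As a usage sketch (not part of this diff): since BenchmarkHook registers itself in HOOKS, it can be enabled through the custom_hooks field of an MMEngine config.
# Hypothetical config snippet; the rest of the config (model, dataset,
# schedule) is omitted.
custom_hooks = [dict(type='BenchmarkHook')]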
......@@ -4,6 +4,7 @@ import warnings
from typing import Optional, Sequence
import mmcv
import numpy as np
from mmengine.fileio import FileClient
from mmengine.hooks import Hook
from mmengine.runner import Runner
......@@ -95,15 +96,27 @@ class Det3DVisualizationHook(Hook):
# is visualized for each evaluation.
total_curr_iter = runner.iter + batch_idx
data_input = dict()
# Only visualize the first sample in the batch
img_path = outputs[0].img_path
img_bytes = self.file_client.get(img_path)
img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
if 'img_path' in outputs[0]:
img_path = outputs[0].img_path
img_bytes = self.file_client.get(img_path)
img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
data_input['img'] = img
if 'lidar_path' in outputs[0]:
lidar_path = outputs[0].lidar_path
num_pts_feats = outputs[0].num_pts_feats
pts_bytes = self.file_client.get(lidar_path)
points = np.frombuffer(pts_bytes, dtype=np.float32)
points = points.reshape(-1, num_pts_feats)
data_input['points'] = points
if total_curr_iter % self.interval == 0:
self._visualizer.add_datasample(
osp.basename(img_path) if self.show else 'val_img',
img,
'val sample',
data_input,
data_sample=outputs[0],
show=self.show,
wait_time=self.wait_time,
......@@ -135,9 +148,20 @@ class Det3DVisualizationHook(Hook):
for data_sample in outputs:
self._test_index += 1
img_path = data_sample.img_path
img_bytes = self.file_client.get(img_path)
img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
data_input = dict()
if 'img_path' in data_sample:
img_path = data_sample.img_path
img_bytes = self.file_client.get(img_path)
img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
data_input['img'] = img
if 'lidar_path' in data_sample:
lidar_path = data_sample.lidar_path
num_pts_feats = data_sample.num_pts_feats
pts_bytes = self.file_client.get(lidar_path)
points = np.frombuffer(pts_bytes, dtype=np.float32)
points = points.reshape(-1, num_pts_feats)
data_input['points'] = points
out_file = None
if self.test_out_dir is not None:
......@@ -145,8 +169,8 @@ class Det3DVisualizationHook(Hook):
out_file = osp.join(self.test_out_dir, out_file)
self._visualizer.add_datasample(
osp.basename(img_path) if self.show else 'test_img',
img,
'test sample',
data_input,
data_sample=data_sample,
show=self.show,
wait_time=self.wait_time,
......
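For reference, a self-contained sketch of the point-decoding pattern used in the visualization hook above; the feature count and byte buffer are illustrative stand-ins for real files.
import numpy as np

# Assume each point is stored as 4 float32 values: x, y, z, intensity.
num_pts_feats = 4
pts_bytes = np.arange(8, dtype=np.float32).tobytes()  # stand-in for file_client.get(lidar_path)
points = np.frombuffer(pts_bytes, dtype=np.float32).reshape(-1, num_pts_feats)
print(points.shape)  # (2, 4)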
......@@ -66,7 +66,8 @@ class KittiMetric(BaseMetric):
self.default_cam_key = default_cam_key
self.file_client_args = file_client_args
self.default_cam_key = default_cam_key
allowed_metrics = ['bbox', 'img_bbox', 'mAP']
allowed_metrics = ['bbox', 'img_bbox', 'mAP', 'LET_mAP']
self.metrics = metric if isinstance(metric, list) else [metric]
for metric in self.metrics:
if metric not in allowed_metrics:
......@@ -168,7 +169,7 @@ class KittiMetric(BaseMetric):
"""Compute the metrics from processed results.
Args:
results (list): The processed results of each batch.
results (list): The processed results of the whole dataset.
Returns:
Dict[str, float]: The computed metrics. The keys are the names of
......@@ -575,7 +576,7 @@ class KittiMetric(BaseMetric):
box_preds = box_dict['bboxes_3d']
scores = box_dict['scores_3d']
labels = box_dict['labels_3d']
sample_idx = info['sample_id']
sample_idx = info['sample_idx']
box_preds.limit_yaw(offset=0.5, period=np.pi * 2)
if len(box_preds) == 0:
......
......@@ -11,8 +11,9 @@ from mmengine.logging import MMLogger, print_log
from mmdet3d.models.layers import box3d_multiclass_nms
from mmdet3d.registry import METRICS
from mmdet3d.structures import (Box3DMode, LiDARInstance3DBoxes, bbox3d2result,
xywhr2xyxyr)
from mmdet3d.structures import (Box3DMode, CameraInstance3DBoxes,
LiDARInstance3DBoxes, bbox3d2result,
points_cam2img, xywhr2xyxyr)
from .kitti_metric import KittiMetric
......@@ -27,7 +28,7 @@ class WaymoMetric(KittiMetric):
Used for storing waymo evaluation programs.
split (str): The split of the evaluation set.
metric (str | list[str]): Metrics to be evaluated.
Default to 'bbox'.
Default to 'mAP'.
pcd_limit_range (list): The range of point cloud used to
filter invalid predicted boxes.
Default to [0, -40, -3, 70.4, 40, 0.0].
......@@ -54,13 +55,14 @@ class WaymoMetric(KittiMetric):
'gpu'. Defaults to 'cpu'.
file_client_args (dict): file client for reading gt in waymo format.
"""
num_cams = 5
def __init__(self,
ann_file: str,
waymo_bin_file: str,
data_root: str,
split: str = 'training',
metric: Union[str, List[str]] = 'bbox',
metric: Union[str, List[str]] = 'mAP',
pcd_limit_range: List[float] = [-85, -85, -5, 85, 85, 5],
prefix: Optional[str] = None,
pklfile_prefix: str = None,
......@@ -70,7 +72,6 @@ class WaymoMetric(KittiMetric):
use_pred_sample_idx: bool = False,
collect_device: str = 'cpu',
file_client_args: dict = dict(backend='disk')):
self.waymo_bin_file = waymo_bin_file
self.data_root = data_root
self.split = split
......@@ -92,7 +93,7 @@ class WaymoMetric(KittiMetric):
"""Compute the metrics from processed results.
Args:
results (list): The processed results of each batch.
results (list): The processed results of the whole dataset.
Returns:
Dict[str, float]: The computed metrics. The keys are the names of
......@@ -104,6 +105,35 @@ class WaymoMetric(KittiMetric):
# load annotations
self.data_infos = load(self.ann_file)['data_list']
# different from KITTI, Waymo does not need to convert the ann file
# handle the mono3d task
if self.task == 'mono3d':
new_data_infos = []
for info in self.data_infos:
height = info['images'][self.default_cam_key]['height']
width = info['images'][self.default_cam_key]['width']
for (cam_key, img_info) in info['images'].items():
camera_info = dict()
camera_info['images'] = dict()
camera_info['images'][cam_key] = img_info
# TODO remove the check by updating the data info;
if 'height' not in img_info:
img_info['height'] = height
img_info['width'] = width
if 'cam_instances' in info \
and cam_key in info['cam_instances']:
camera_info['instances'] = info['cam_instances'][
cam_key]
else:
camera_info['instances'] = []
camera_info['ego2global'] = info['ego2global']
if 'image_sweeps' in info:
camera_info['image_sweeps'] = info['image_sweeps']
# TODO: check if the sample id needs to be modified
# TODO: check when it will be used other than in evaluation.
camera_info['sample_id'] = info['sample_id']
new_data_infos.append(camera_info)
self.data_infos = new_data_infos
if self.pklfile_prefix is None:
eval_tmp_dir = tempfile.TemporaryDirectory()
......@@ -120,65 +150,141 @@ class WaymoMetric(KittiMetric):
submission_prefix=self.submission_prefix,
classes=self.classes)
import subprocess
eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \
f'compute_detection_metrics_main {pklfile_prefix}.bin ' + \
f'{self.waymo_bin_file}'
print(eval_str)
ret_bytes = subprocess.check_output(eval_str, shell=True)
ret_texts = ret_bytes.decode('utf-8')
print_log(ret_texts, logger=logger)
ap_dict = {
'Vehicle/L1 mAP': 0,
'Vehicle/L1 mAPH': 0,
'Vehicle/L2 mAP': 0,
'Vehicle/L2 mAPH': 0,
'Pedestrian/L1 mAP': 0,
'Pedestrian/L1 mAPH': 0,
'Pedestrian/L2 mAP': 0,
'Pedestrian/L2 mAPH': 0,
'Sign/L1 mAP': 0,
'Sign/L1 mAPH': 0,
'Sign/L2 mAP': 0,
'Sign/L2 mAPH': 0,
'Cyclist/L1 mAP': 0,
'Cyclist/L1 mAPH': 0,
'Cyclist/L2 mAP': 0,
'Cyclist/L2 mAPH': 0,
'Overall/L1 mAP': 0,
'Overall/L1 mAPH': 0,
'Overall/L2 mAP': 0,
'Overall/L2 mAPH': 0
}
mAP_splits = ret_texts.split('mAP ')
mAPH_splits = ret_texts.split('mAPH ')
for idx, key in enumerate(ap_dict.keys()):
split_idx = int(idx / 2) + 1
if idx % 2 == 0: # mAP
ap_dict[key] = float(mAP_splits[split_idx].split(']')[0])
else: # mAPH
ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0])
ap_dict['Overall/L1 mAP'] = \
(ap_dict['Vehicle/L1 mAP'] + ap_dict['Pedestrian/L1 mAP'] +
ap_dict['Cyclist/L1 mAP']) / 3
ap_dict['Overall/L1 mAPH'] = \
(ap_dict['Vehicle/L1 mAPH'] + ap_dict['Pedestrian/L1 mAPH'] +
ap_dict['Cyclist/L1 mAPH']) / 3
ap_dict['Overall/L2 mAP'] = \
(ap_dict['Vehicle/L2 mAP'] + ap_dict['Pedestrian/L2 mAP'] +
ap_dict['Cyclist/L2 mAP']) / 3
ap_dict['Overall/L2 mAPH'] = \
(ap_dict['Vehicle/L2 mAPH'] + ap_dict['Pedestrian/L2 mAPH'] +
ap_dict['Cyclist/L2 mAPH']) / 3
metric_dict = {}
for metric in self.metrics:
ap_dict = self.waymo_evaluate(
pklfile_prefix, metric=metric, logger=logger)
metric_dict[metric] = ap_dict
if eval_tmp_dir is not None:
eval_tmp_dir.cleanup()
if tmp_dir is not None:
tmp_dir.cleanup()
return metric_dict
def waymo_evaluate(self,
pklfile_prefix: str,
metric: str = None,
logger: MMLogger = None) -> dict:
"""Evaluation in Waymo protocol.
Args:
pklfile_prefix (str): The location that stored the prediction
results.
metric (str): Metric to be evaluated. Defaults to None.
logger (MMLogger, optional): Logger used for printing
related information during evaluation. Default: None.
Returns:
dict[str, float]: Results of each evaluation metric.
"""
import subprocess
if metric == 'mAP':
eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \
f'compute_detection_metrics_main {pklfile_prefix}.bin ' + \
f'{self.waymo_bin_file}'
print(eval_str)
ret_bytes = subprocess.check_output(eval_str, shell=True)
ret_texts = ret_bytes.decode('utf-8')
print_log(ret_texts, logger=logger)
ap_dict = {
'Vehicle/L1 mAP': 0,
'Vehicle/L1 mAPH': 0,
'Vehicle/L2 mAP': 0,
'Vehicle/L2 mAPH': 0,
'Pedestrian/L1 mAP': 0,
'Pedestrian/L1 mAPH': 0,
'Pedestrian/L2 mAP': 0,
'Pedestrian/L2 mAPH': 0,
'Sign/L1 mAP': 0,
'Sign/L1 mAPH': 0,
'Sign/L2 mAP': 0,
'Sign/L2 mAPH': 0,
'Cyclist/L1 mAP': 0,
'Cyclist/L1 mAPH': 0,
'Cyclist/L2 mAP': 0,
'Cyclist/L2 mAPH': 0,
'Overall/L1 mAP': 0,
'Overall/L1 mAPH': 0,
'Overall/L2 mAP': 0,
'Overall/L2 mAPH': 0
}
mAP_splits = ret_texts.split('mAP ')
mAPH_splits = ret_texts.split('mAPH ')
for idx, key in enumerate(ap_dict.keys()):
split_idx = int(idx / 2) + 1
if idx % 2 == 0: # mAP
ap_dict[key] = float(mAP_splits[split_idx].split(']')[0])
else: # mAPH
ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0])
ap_dict['Overall/L1 mAP'] = \
(ap_dict['Vehicle/L1 mAP'] + ap_dict['Pedestrian/L1 mAP'] +
ap_dict['Cyclist/L1 mAP']) / 3
ap_dict['Overall/L1 mAPH'] = \
(ap_dict['Vehicle/L1 mAPH'] + ap_dict['Pedestrian/L1 mAPH'] +
ap_dict['Cyclist/L1 mAPH']) / 3
ap_dict['Overall/L2 mAP'] = \
(ap_dict['Vehicle/L2 mAP'] + ap_dict['Pedestrian/L2 mAP'] +
ap_dict['Cyclist/L2 mAP']) / 3
ap_dict['Overall/L2 mAPH'] = \
(ap_dict['Vehicle/L2 mAPH'] + ap_dict['Pedestrian/L2 mAPH'] +
ap_dict['Cyclist/L2 mAPH']) / 3
elif metric == 'LET_mAP':
eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \
f'compute_detection_let_metrics_main {pklfile_prefix}.bin ' + \
f'{self.waymo_bin_file}'
print(eval_str)
ret_bytes = subprocess.check_output(eval_str, shell=True)
ret_texts = ret_bytes.decode('utf-8')
print_log(ret_texts, logger=logger)
ap_dict = {
'Vehicle mAPL': 0,
'Vehicle mAP': 0,
'Vehicle mAPH': 0,
'Pedestrian mAPL': 0,
'Pedestrian mAP': 0,
'Pedestrian mAPH': 0,
'Sign mAPL': 0,
'Sign mAP': 0,
'Sign mAPH': 0,
'Cyclist mAPL': 0,
'Cyclist mAP': 0,
'Cyclist mAPH': 0,
'Overall mAPL': 0,
'Overall mAP': 0,
'Overall mAPH': 0
}
mAPL_splits = ret_texts.split('mAPL ')
mAP_splits = ret_texts.split('mAP ')
mAPH_splits = ret_texts.split('mAPH ')
for idx, key in enumerate(ap_dict.keys()):
split_idx = int(idx / 3) + 1
if idx % 3 == 0: # mAPL
ap_dict[key] = float(mAPL_splits[split_idx].split(']')[0])
elif idx % 3 == 1: # mAP
ap_dict[key] = float(mAP_splits[split_idx].split(']')[0])
else: # mAPH
ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0])
ap_dict['Overall mAPL'] = \
(ap_dict['Vehicle mAPL'] + ap_dict['Pedestrian mAPL'] +
ap_dict['Cyclist mAPL']) / 3
ap_dict['Overall mAP'] = \
(ap_dict['Vehicle mAP'] + ap_dict['Pedestrian mAP'] +
ap_dict['Cyclist mAP']) / 3
ap_dict['Overall mAPH'] = \
(ap_dict['Vehicle mAPH'] + ap_dict['Pedestrian mAPH'] +
ap_dict['Cyclist mAPH']) / 3
return ap_dict
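To illustrate the parsing above, a toy example; the bracketed metrics text is an assumed format inferred from the split logic, not real output of the Waymo evaluation binary.
# Hypothetical fragment of ret_texts.
ret_texts = '[Vehicle L1 mAP 0.75] [Vehicle L1 mAPH 0.70]'
mAP_splits = ret_texts.split('mAP ')
mAPH_splits = ret_texts.split('mAPH ')
print(float(mAP_splits[1].split(']')[0]))   # 0.75
print(float(mAPH_splits[1].split(']')[0]))  # 0.7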
def format_results(self,
......@@ -254,7 +360,7 @@ class WaymoMetric(KittiMetric):
for cam_idx in range(self.num_cams):
box_dict[key].append(box_dict_per_frame[cam_idx][key])
# merge each elements
box_dict['sample_id'] = cam0_info['image_id']
box_dict['sample_idx'] = cam0_info['image_id']
for key in ['bbox', 'box3d_lidar', 'scores', 'label_preds']:
box_dict[key] = np.concatenate(box_dict[key])
......@@ -284,14 +390,14 @@ class WaymoMetric(KittiMetric):
nms_cfg.max_per_frame, nms_cfg)
lidar_boxes3d = LiDARInstance3DBoxes(boxes3d)
det = bbox3d2result(lidar_boxes3d, scores, labels)
box_preds_lidar = det['boxes_3d']
box_preds_lidar = det['bboxes_3d']
scores = det['scores_3d']
labels = det['labels_3d']
# box_preds_camera is in the cam0 system
rect = cam0_info['calib']['R0_rect'].astype(np.float32)
Trv2c = cam0_info['calib']['Tr_velo_to_cam'].astype(np.float32)
lidar2cam = cam0_info['images'][self.default_cam_key]['lidar2img']
lidar2cam = np.array(lidar2cam).astype(np.float32)
box_preds_camera = box_preds_lidar.convert_to(
Box3DMode.CAM, rect @ Trv2c, correct_yaw=True)
Box3DMode.CAM, np.linalg.inv(lidar2cam), correct_yaw=True)
# Note: bbox is meaningless in final evaluation, set to 0
merged_box_dict = dict(
bbox=np.zeros([box_preds_lidar.tensor.shape[0], 4]),
......@@ -299,7 +405,7 @@ class WaymoMetric(KittiMetric):
box3d_lidar=box_preds_lidar.tensor.numpy(),
scores=scores.numpy(),
label_preds=labels.numpy(),
sample_idx=box_dict['sample_idx'],
sample_idx=box_dict['sample_id'],
)
return merged_box_dict
......@@ -337,23 +443,31 @@ class WaymoMetric(KittiMetric):
annos = []
sample_idx = sample_id_list[idx]
info = self.data_infos[sample_idx]
# By default, 'CAM2' is used to compute the metric. If you want to
# use another camera, please modify it.
image_shape = (info['images'][self.default_cam_key]['height'],
info['images'][self.default_cam_key]['width'])
if self.task == 'mono3d':
if self.task == 'mono_det':
if idx % self.num_cams == 0:
box_dict_per_frame = []
cam0_idx = idx
box_dict = self.convert_valid_bboxes(pred_dicts, info)
cam0_key = list(info['images'].keys())[0]
cam0_info = info
# In mono3d, we use 'CAM_FRONT' (the first camera index) to get
# the default image shape. If you want to use another camera,
# please modify it.
image_shape = (info['images'][cam0_key]['height'],
info['images'][cam0_key]['width'])
box_dict = self.convert_valid_bboxes(pred_dicts, info)
else:
box_dict = self.convert_valid_bboxes(pred_dicts, info)
# By default, 'CAM_FRONT' is used to compute the metric.
# If you want to use another camera, please modify it.
image_shape = (info['images'][self.default_cam_key]['height'],
info['images'][self.default_cam_key]['width'])
if self.task == 'mono3d':
box_dict_per_frame.append(box_dict)
if (idx + 1) % self.num_cams != 0:
continue
box_dict = self.merge_multi_view_boxes(
box_dict_per_frame, self.data_infos[cam0_idx])
box_dict = self.merge_multi_view_boxes(box_dict_per_frame,
cam0_info)
anno = {
'name': [],
'truncated': [],
......@@ -444,3 +558,106 @@ class WaymoMetric(KittiMetric):
print(f'Result is saved to {out}.')
return det_annos
def convert_valid_bboxes(self, box_dict: dict, info: dict):
"""Convert the predicted boxes into valid ones. Should handle the
different task mode (mono3d, mv3d, lidar), separately.
Args:
box_dict (dict): Box dictionaries to be converted.
- boxes_3d (:obj:`LiDARInstance3DBoxes`): 3D bounding boxes.
- scores_3d (torch.Tensor): Scores of boxes.
- labels_3d (torch.Tensor): Class labels of boxes.
info (dict): Data info.
Returns:
dict: Valid predicted boxes.
- bbox (np.ndarray): 2D bounding boxes.
- box3d_camera (np.ndarray): 3D bounding boxes in
camera coordinate.
- box3d_lidar (np.ndarray): 3D bounding boxes in
LiDAR coordinate.
- scores (np.ndarray): Scores of boxes.
- label_preds (np.ndarray): Class label predictions.
- sample_idx (int): Sample index.
"""
# TODO: refactor this function
box_preds = box_dict['bboxes_3d']
scores = box_dict['scores_3d']
labels = box_dict['labels_3d']
sample_idx = info['sample_id']
box_preds.limit_yaw(offset=0.5, period=np.pi * 2)
if len(box_preds) == 0:
return dict(
bbox=np.zeros([0, 4]),
box3d_camera=np.zeros([0, 7]),
box3d_lidar=np.zeros([0, 7]),
scores=np.zeros([0]),
label_preds=np.zeros([0, 4]),
sample_idx=sample_idx)
# By default, 'CAM2' is used to compute the metric. If you want to
# use another camera, please modify it.
if self.task in ['mv3d', 'lidar']:
cam_key = self.default_cam_key
elif self.task == 'mono3d':
cam_key = list(info['images'].keys())[0]
else:
raise NotImplementedError
lidar2cam = np.array(info['images'][cam_key]['lidar2cam']).astype(
np.float32)
P2 = np.array(info['images'][cam_key]['cam2img']).astype(np.float32)
img_shape = (info['images'][cam_key]['height'],
info['images'][cam_key]['width'])
P2 = box_preds.tensor.new_tensor(P2)
if isinstance(box_preds, LiDARInstance3DBoxes):
box_preds_camera = box_preds.convert_to(Box3DMode.CAM, lidar2cam)
box_preds_lidar = box_preds
elif isinstance(box_preds, CameraInstance3DBoxes):
box_preds_camera = box_preds
box_preds_lidar = box_preds.convert_to(Box3DMode.LIDAR,
np.linalg.inv(lidar2cam))
box_corners = box_preds_camera.corners
box_corners_in_image = points_cam2img(box_corners, P2)
# box_corners_in_image: [N, 8, 2]
minxy = torch.min(box_corners_in_image, dim=1)[0]
maxxy = torch.max(box_corners_in_image, dim=1)[0]
box_2d_preds = torch.cat([minxy, maxxy], dim=1)
# Post-processing
# check box_preds_camera
image_shape = box_preds.tensor.new_tensor(img_shape)
valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) &
(box_2d_preds[:, 1] < image_shape[0]) &
(box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0))
# check box_preds_lidar
if self.task in ['lidar', 'mv3d']:
limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range)
valid_pcd_inds = ((box_preds_lidar.center > limit_range[:3]) &
(box_preds_lidar.center < limit_range[3:]))
valid_inds = valid_pcd_inds.all(-1)
elif self.task == 'mono3d':
valid_inds = valid_cam_inds
if valid_inds.sum() > 0:
return dict(
bbox=box_2d_preds[valid_inds, :].numpy(),
pred_box_type_3d=type(box_preds),
box3d_camera=box_preds_camera[valid_inds].tensor.numpy(),
box3d_lidar=box_preds_lidar[valid_inds].tensor.numpy(),
scores=scores[valid_inds].numpy(),
label_preds=labels[valid_inds].numpy(),
sample_idx=sample_idx)
else:
return dict(
bbox=np.zeros([0, 4]),
pred_box_type_3d=type(box_preds),
box3d_camera=np.zeros([0, 7]),
box3d_lidar=np.zeros([0, 7]),
scores=np.zeros([0]),
label_preds=np.zeros([0]),
sample_idx=sample_idx)
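The corner-projection step in convert_valid_bboxes reduces the 8 projected corners of each box to an axis-aligned 2D box; a standalone sketch of that reduction with random stand-in data:
import torch

box_corners_in_image = torch.rand(5, 8, 2) * 100   # [N, 8, 2] projected corners
minxy = torch.min(box_corners_in_image, dim=1)[0]  # [N, 2]
maxxy = torch.max(box_corners_in_image, dim=1)[0]  # [N, 2]
box_2d_preds = torch.cat([minxy, maxxy], dim=1)    # [N, 4] as (x1, y1, x2, y2)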
......@@ -92,7 +92,7 @@ def build_segmentor(cfg, train_cfg=None, test_cfg=None):
def build_model(cfg, train_cfg=None, test_cfg=None):
"""A function warpper for building 3D detector or segmentor according to
"""A function wrapper for building 3D detector or segmentor according to
cfg.
Should be deprecated in the future.
......
......@@ -13,7 +13,7 @@ from torch.nn import functional as F
from mmdet3d.registry import MODELS
from mmdet3d.utils import OptConfigType
from mmdet.models import DetDataPreprocessor
from mmdet.models.utils.misc import samplelist_boxlist2tensor
from .utils import multiview_img_stack_batch
@MODELS.register_module()
......@@ -75,7 +75,7 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
seg_pad_value: int = 255,
bgr_to_rgb: bool = False,
rgb_to_bgr: bool = False,
boxlist2tensor: bool = True,
boxtype2tensor: bool = True,
batch_augments: Optional[List[dict]] = None):
super().__init__(
mean=mean,
......@@ -88,7 +88,6 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
seg_pad_value=seg_pad_value,
bgr_to_rgb=bgr_to_rgb,
rgb_to_bgr=rgb_to_bgr,
boxlist2tensor=boxlist2tensor,
batch_augments=batch_augments)
self.voxel = voxel
self.voxel_type = voxel_type
......@@ -104,10 +103,10 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
``BaseDataPreprocessor``.
Args:
data (List[dict] | List[List[dict]]): data from dataloader.
The outer list always represent the batch size, when it is
a list[list[dict]], the inter list indicate test time
augmentation.
data (dict | List[dict]): data from dataloader.
The dict contains the whole batch data, when it is
a list[dict], the list indicate test time augmentation.
training (bool): Whether to enable training time augmentation.
Defaults to False.
......@@ -144,7 +143,6 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
data = self.collate_data(data)
inputs, data_samples = data['inputs'], data['data_samples']
batch_inputs = dict()
if 'points' in inputs:
......@@ -169,9 +167,14 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
'pad_shape': pad_shape
})
if self.boxlist2tensor:
if hasattr(self, 'boxtype2tensor') and self.boxtype2tensor:
from mmdet.models.utils.misc import \
samplelist_boxtype2tensor
samplelist_boxtype2tensor(data_samples)
elif hasattr(self, 'boxlist2tensor') and self.boxlist2tensor:
from mmdet.models.utils.misc import \
samplelist_boxlist2tensor
samplelist_boxlist2tensor(data_samples)
if self.pad_mask:
self.pad_gt_masks(data_samples)
......@@ -185,6 +188,23 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
return {'inputs': batch_inputs, 'data_samples': data_samples}
def preprocess_img(self, _batch_img):
# channel transform
if self._channel_conversion:
_batch_img = _batch_img[[2, 1, 0], ...]
# Convert to float after channel conversion to ensure
# efficiency
_batch_img = _batch_img.float()
# Normalization.
if self._enable_normalize:
if self.mean.shape[0] == 3:
assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3, (
'If the mean has 3 values, the input tensor '
'should in shape of (3, H, W), but got the '
f'tensor with shape {_batch_img.shape}')
_batch_img = (_batch_img - self.mean) / self.std
return _batch_img
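The per-image path above amounts to the following standalone steps; the mean/std values are illustrative assumptions.
import torch

img = torch.randint(0, 256, (3, 4, 4), dtype=torch.uint8)
img = img[[2, 1, 0], ...].float()  # BGR -> RGB, then convert to float
mean = torch.tensor([123.675, 116.28, 103.53]).view(3, 1, 1)
std = torch.tensor([58.395, 57.12, 57.375]).view(3, 1, 1)
img = (img - mean) / std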
def collate_data(self, data: dict) -> dict:
"""Copying data to the target device and Performs normalization、
padding and bgr2rgb conversion and stack based on
......@@ -203,30 +223,30 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
if 'img' in data['inputs']:
_batch_imgs = data['inputs']['img']
# Process data with `pseudo_collate`.
if is_list_of(_batch_imgs, torch.Tensor):
batch_imgs = []
img_dim = _batch_imgs[0].dim()
for _batch_img in _batch_imgs:
# channel transform
if self._channel_conversion:
_batch_img = _batch_img[[2, 1, 0], ...]
# Convert to float after channel conversion to ensure
# efficiency
_batch_img = _batch_img.float()
# Normalization.
if self._enable_normalize:
if self.mean.shape[0] == 3:
assert _batch_img.dim(
) == 3 and _batch_img.shape[0] == 3, (
'If the mean has 3 values, the input tensor '
'should in shape of (3, H, W), but got the '
f'tensor with shape {_batch_img.shape}')
_batch_img = (_batch_img - self.mean) / self.std
if img_dim == 3: # standard img
_batch_img = self.preprocess_img(_batch_img)
elif img_dim == 4:
_batch_img = [
self.preprocess_img(_img) for _img in _batch_img
]
_batch_img = torch.stack(_batch_img, dim=0)
batch_imgs.append(_batch_img)
# Pad and stack Tensor.
batch_imgs = stack_batch(batch_imgs, self.pad_size_divisor,
self.pad_value)
if img_dim == 3:
batch_imgs = stack_batch(batch_imgs, self.pad_size_divisor,
self.pad_value)
elif img_dim == 4:
batch_imgs = multiview_img_stack_batch(
batch_imgs, self.pad_size_divisor, self.pad_value)
# Process data with `default_collate`.
elif isinstance(_batch_imgs, torch.Tensor):
assert _batch_imgs.dim() == 4, (
......@@ -270,6 +290,10 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
if is_list_of(_batch_inputs, torch.Tensor):
batch_pad_shape = []
for ori_input in _batch_inputs:
if ori_input.dim() == 4:
# means multiview input, select one of the
# images to calculate the pad shape
ori_input = ori_input[0]
pad_h = int(
np.ceil(ori_input.shape[1] /
self.pad_size_divisor)) * self.pad_size_divisor
......@@ -293,7 +317,7 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
else:
raise TypeError('Output of `cast_data` should be a list of dict '
'or a tuple with inputs and data_samples, but got'
f'{type(data)} {data}')
f'{type(data)}: {data}')
return batch_pad_shape
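The pad-shape computation above rounds H and W up to the nearest multiple of the divisor; e.g. with a KITTI-like image size (values chosen for illustration):
import numpy as np

pad_size_divisor = 32
h, w = 375, 1242
pad_h = int(np.ceil(h / pad_size_divisor)) * pad_size_divisor  # 384
pad_w = int(np.ceil(w / pad_size_divisor)) * pad_size_divisor  # 1248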
@torch.no_grad()
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union
import torch
import torch.nn.functional as F
def multiview_img_stack_batch(
tensor_list: List[torch.Tensor],
pad_size_divisor: int = 1,
pad_value: Union[int, float] = 0) -> torch.Tensor:
"""
Compared to the stack_batch in mmengine.model.utils,
multiview_img_stack_batch further handles multiview images.
See the diff of padded_sizes[:, :-2] = 0 vs padded_sizes[:, 0] = 0 below.
Stack multiple tensors to form a batch and pad the tensors to the max
shape using the right-bottom padding mode. If
``pad_size_divisor > 0``, add padding to ensure the shape of each dim is
divisible by ``pad_size_divisor``.
Args:
tensor_list (List[Tensor]): A list of tensors with the same dim.
pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding
to ensure the shape of each dim is divisible by
``pad_size_divisor``. This depends on the model, and many
models need to be divisible by 32. Defaults to 1.
pad_value (int, float): The padding value. Defaults to 0.
Returns:
Tensor: The n dim tensor.
"""
assert isinstance(
tensor_list,
list), (f'Expected input type to be list, but got {type(tensor_list)}')
assert tensor_list, '`tensor_list` should not be an empty list'
assert len({
tensor.ndim
for tensor in tensor_list
}) == 1, (f'Expected the dimensions of all tensors to be the same, '
f'but got {[tensor.ndim for tensor in tensor_list]}')
dim = tensor_list[0].dim()
num_img = len(tensor_list)
all_sizes: torch.Tensor = torch.Tensor(
[tensor.shape for tensor in tensor_list])
max_sizes = torch.ceil(
torch.max(all_sizes, dim=0)[0] / pad_size_divisor) * pad_size_divisor
padded_sizes = max_sizes - all_sizes
# Dims other than the last two (e.g. channel and num_views) normally
# should not be padded.
padded_sizes[:, :-2] = 0
if padded_sizes.sum() == 0:
return torch.stack(tensor_list)
# `pad` is the second arguments of `F.pad`. If pad is (1, 2, 3, 4),
# it means that padding the last dim with 1(left) 2(right), padding the
# penultimate dim to 3(top) 4(bottom). The order of `pad` is opposite of
# the `padded_sizes`. Therefore, the `padded_sizes` needs to be reversed,
# and only odd index of pad should be assigned to keep padding "right" and
# "bottom".
pad = torch.zeros(num_img, 2 * dim, dtype=torch.int)
pad[:, 1::2] = padded_sizes[:, range(dim - 1, -1, -1)]
batch_tensor = []
for idx, tensor in enumerate(tensor_list):
batch_tensor.append(
F.pad(tensor, tuple(pad[idx].tolist()), value=pad_value))
return torch.stack(batch_tensor)
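A quick usage sketch of multiview_img_stack_batch; the tensor shapes are arbitrary assumptions.
import torch

# Two multiview image tensors (num_views, C, H, W) with different H and W.
a = torch.zeros(2, 3, 30, 40)
b = torch.zeros(2, 3, 32, 48)
batch = multiview_img_stack_batch([a, b], pad_size_divisor=32)
print(batch.shape)  # torch.Size([2, 2, 3, 32, 64])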
......@@ -204,7 +204,7 @@ class Base3DDenseHead(BaseModule, metaclass=ABCMeta):
score_factors (list[Tensor], optional): Score factor for
all scale level, each is a 4D-tensor, has shape
(batch_size, num_priors * 1, H, W). Defaults to None.
batch_input_metas (list[dict], Optional): Batch image meta info.
batch_input_metas (list[dict], Optional): Batch inputs meta info.
Defaults to None.
cfg (ConfigDict, optional): Test / postprocessing
configuration, if None, test_cfg would be used.
......
......@@ -183,8 +183,7 @@ class PartA2RPNHead(Anchor3DHead):
result = self.class_agnostic_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
mlvl_max_scores, mlvl_label_pred,
mlvl_cls_score, mlvl_dir_scores,
score_thr, cfg.nms_post, cfg,
input_meta)
score_thr, cfg, input_meta)
return result
def loss_and_predict(self,
......@@ -275,7 +274,7 @@ class PartA2RPNHead(Anchor3DHead):
mlvl_bboxes_for_nms: Tensor,
mlvl_max_scores: Tensor, mlvl_label_pred: Tensor,
mlvl_cls_score: Tensor, mlvl_dir_scores: Tensor,
score_thr: int, max_num: int, cfg: ConfigDict,
score_thr: int, cfg: ConfigDict,
input_meta: dict) -> Dict:
"""Class agnostic nms for single batch.
......@@ -291,7 +290,6 @@ class PartA2RPNHead(Anchor3DHead):
mlvl_dir_scores (torch.Tensor): Direction scores of
Multi-level bbox.
score_thr (int): Score threshold.
max_num (int): Max number of bboxes after nms.
cfg (:obj:`ConfigDict`): Training or testing config.
input_meta (dict): Contain pcd and img's meta info.
......@@ -339,9 +337,9 @@ class PartA2RPNHead(Anchor3DHead):
scores = torch.cat(scores, dim=0)
cls_scores = torch.cat(cls_scores, dim=0)
labels = torch.cat(labels, dim=0)
if bboxes.shape[0] > max_num:
if bboxes.shape[0] > cfg.nms_post:
_, inds = scores.sort(descending=True)
inds = inds[:max_num]
inds = inds[:cfg.nms_post]
bboxes = bboxes[inds, :]
labels = labels[inds]
scores = scores[inds]
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional, Tuple
import torch
from mmengine.model import BaseModule
from mmengine.structures import InstanceData
from torch import Tensor
from torch import nn as nn
from mmdet3d.models.builder import build_loss
from mmdet3d.models.layers import nms_bev, nms_normal_bev
from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures import xywhr2xyxyr
from mmdet3d.structures.bbox_3d import (DepthInstance3DBoxes,
from mmdet3d.structures.bbox_3d import (BaseInstance3DBoxes,
DepthInstance3DBoxes,
LiDARInstance3DBoxes)
from mmdet3d.structures.det3d_data_sample import SampleList
from mmdet3d.utils.typing import InstanceList
from mmdet.models.utils import multi_apply
......@@ -34,15 +40,15 @@ class PointRPNHead(BaseModule):
"""
def __init__(self,
num_classes,
train_cfg,
test_cfg,
pred_layer_cfg=None,
enlarge_width=0.1,
cls_loss=None,
bbox_loss=None,
bbox_coder=None,
init_cfg=None):
num_classes: int,
train_cfg: dict,
test_cfg: dict,
pred_layer_cfg: Optional[dict] = None,
enlarge_width: float = 0.1,
cls_loss: Optional[dict] = None,
bbox_loss: Optional[dict] = None,
bbox_coder: Optional[dict] = None,
init_cfg: Optional[dict] = None) -> None:
super().__init__(init_cfg=init_cfg)
self.num_classes = num_classes
self.train_cfg = train_cfg
......@@ -50,8 +56,8 @@ class PointRPNHead(BaseModule):
self.enlarge_width = enlarge_width
# build loss function
self.bbox_loss = build_loss(bbox_loss)
self.cls_loss = build_loss(cls_loss)
self.bbox_loss = MODELS.build(bbox_loss)
self.cls_loss = MODELS.build(cls_loss)
# build box coder
self.bbox_coder = TASK_UTILS.build(bbox_coder)
......@@ -67,7 +73,8 @@ class PointRPNHead(BaseModule):
input_channels=pred_layer_cfg.in_channels,
output_channels=self._get_reg_out_channels())
def _make_fc_layers(self, fc_cfg, input_channels, output_channels):
def _make_fc_layers(self, fc_cfg: dict, input_channels: int,
output_channels: int) -> nn.Sequential:
"""Make fully connect layers.
Args:
......@@ -102,7 +109,7 @@ class PointRPNHead(BaseModule):
# torch.cos(yaw) (1), torch.sin(yaw) (1)
return self.bbox_coder.code_size
def forward(self, feat_dict):
def forward(self, feat_dict: dict) -> Tuple[List[Tensor]]:
"""Forward pass.
Args:
......@@ -124,30 +131,35 @@ class PointRPNHead(BaseModule):
batch_size, -1, self._get_reg_out_channels())
return point_box_preds, point_cls_preds
def loss(self,
bbox_preds,
cls_preds,
points,
gt_bboxes_3d,
gt_labels_3d,
img_metas=None):
def loss_by_feat(
self,
bbox_preds: List[Tensor],
cls_preds: List[Tensor],
points: List[Tensor],
batch_gt_instances_3d: InstanceList,
batch_input_metas: Optional[List[dict]] = None,
batch_gt_instances_ignore: Optional[InstanceList] = None) -> Dict:
"""Compute loss.
Args:
bbox_preds (dict): Predictions from forward of PointRCNN RPN_Head.
cls_preds (dict): Classification from forward of PointRCNN
RPN_Head.
bbox_preds (list[torch.Tensor]): Predictions from forward of
PointRCNN RPN_Head.
cls_preds (list[torch.Tensor]): Classification from forward of
PointRCNN RPN_Head.
points (list[torch.Tensor]): Input points.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
bboxes of each sample.
gt_labels_3d (list[torch.Tensor]): Labels of each sample.
img_metas (list[dict], Optional): Contain pcd and img's meta info.
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instances_3d. It usually includes ``bboxes_3d`` and
``labels_3d`` attributes.
batch_input_metas (list[dict]): Contain pcd and img's meta info.
batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
Batch of gt_instances_ignore. It includes ``bboxes`` attribute
data that is ignored during training and testing.
Defaults to None.
Returns:
dict: Losses of PointRCNN RPN module.
"""
targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d)
targets = self.get_targets(points, batch_gt_instances_3d)
(bbox_targets, mask_targets, positive_mask, negative_mask,
box_loss_weights, point_targets) = targets
......@@ -169,25 +181,25 @@ class PointRPNHead(BaseModule):
return losses
def get_targets(self, points, gt_bboxes_3d, gt_labels_3d):
def get_targets(self, points: List[Tensor],
batch_gt_instances_3d: InstanceList) -> Tuple[Tensor]:
"""Generate targets of PointRCNN RPN head.
Args:
points (list[torch.Tensor]): Points of each batch.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): Labels of each batch.
points (list[torch.Tensor]): Points in one batch.
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instances_3d. It usually includes ``bboxes_3d`` and
``labels_3d`` attributes.
Returns:
tuple[torch.Tensor]: Targets of PointRCNN RPN head.
"""
# find empty example
for index in range(len(gt_labels_3d)):
if len(gt_labels_3d[index]) == 0:
fake_box = gt_bboxes_3d[index].tensor.new_zeros(
1, gt_bboxes_3d[index].tensor.shape[-1])
gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box)
gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1)
gt_labels_3d = [
instances.labels_3d for instances in batch_gt_instances_3d
]
gt_bboxes_3d = [
instances.bboxes_3d for instances in batch_gt_instances_3d
]
(bbox_targets, mask_targets, positive_mask, negative_mask,
point_targets) = multi_apply(self.get_targets_single, points,
......@@ -202,7 +214,9 @@ class PointRPNHead(BaseModule):
return (bbox_targets, mask_targets, positive_mask, negative_mask,
box_loss_weights, point_targets)
def get_targets_single(self, points, gt_bboxes_3d, gt_labels_3d):
def get_targets_single(self, points: Tensor,
gt_bboxes_3d: BaseInstance3DBoxes,
gt_labels_3d: Tensor) -> Tuple[Tensor]:
"""Generate targets of PointRCNN RPN head for single batch.
Args:
......@@ -243,24 +257,34 @@ class PointRPNHead(BaseModule):
return (bbox_targets, mask_targets, positive_mask, negative_mask,
point_targets)
def get_bboxes(self,
points,
bbox_preds,
cls_preds,
input_metas,
rescale=False):
def predict_by_feat(self, points: Tensor, bbox_preds: List[Tensor],
cls_preds: List[Tensor], batch_input_metas: List[dict],
cfg: Optional[dict]) -> InstanceList:
"""Generate bboxes from RPN head predictions.
Args:
points (torch.Tensor): Input points.
bbox_preds (dict): Regression predictions from PointRCNN head.
cls_preds (dict): Class scores predictions from PointRCNN head.
input_metas (list[dict]): Point cloud and image's meta info.
rescale (bool, optional): Whether to rescale bboxes.
Defaults to False.
bbox_preds (list[tensor]): Regression predictions from PointRCNN
head.
cls_preds (list[tensor]): Class scores predictions from PointRCNN
head.
batch_input_metas (list[dict]): Batch inputs meta info.
cfg (ConfigDict, optional): Test / postprocessing
configuration.
Returns:
list[tuple[torch.Tensor]]: Bounding boxes, scores and labels.
list[:obj:`InstanceData`]: Detection results of each sample
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
contains a tensor with shape (num_instances, C), where
C >= 7.
- cls_preds (torch.Tensor): Class score of each bbox.
"""
sem_scores = cls_preds.sigmoid()
obj_scores = sem_scores.max(-1)[0]
......@@ -271,30 +295,40 @@ class PointRPNHead(BaseModule):
for b in range(batch_size):
bbox3d = self.bbox_coder.decode(bbox_preds[b], points[b, ..., :3],
object_class[b])
mask = ~bbox3d.sum(dim=1).isinf()
bbox_selected, score_selected, labels, cls_preds_selected = \
self.class_agnostic_nms(obj_scores[b], sem_scores[b], bbox3d,
points[b, ..., :3], input_metas[b])
bbox = input_metas[b]['box_type_3d'](
bbox_selected.clone(),
box_dim=bbox_selected.shape[-1],
with_yaw=True)
results.append((bbox, score_selected, labels, cls_preds_selected))
self.class_agnostic_nms(obj_scores[b][mask],
sem_scores[b][mask, :],
bbox3d[mask, :],
points[b, ..., :3][mask, :],
batch_input_metas[b],
cfg.nms_cfg)
bbox_selected = batch_input_metas[b]['box_type_3d'](
bbox_selected, box_dim=bbox_selected.shape[-1])
result = InstanceData()
result.bboxes_3d = bbox_selected
result.scores_3d = score_selected
result.labels_3d = labels
result.cls_preds = cls_preds_selected
results.append(result)
return results
def class_agnostic_nms(self, obj_scores, sem_scores, bbox, points,
input_meta):
def class_agnostic_nms(self, obj_scores: Tensor, sem_scores: Tensor,
bbox: Tensor, points: Tensor, input_meta: Dict,
nms_cfg: Dict) -> Tuple[Tensor]:
"""Class agnostic nms.
Args:
obj_scores (torch.Tensor): Objectness score of bounding boxes.
sem_scores (torch.Tensor): Semantic class score of bounding boxes.
bbox (torch.Tensor): Predicted bounding boxes.
points (torch.Tensor): Input points.
input_meta (dict): Contain pcd and img's meta info.
nms_cfg (dict): NMS config dict.
Returns:
tuple[torch.Tensor]: Bounding boxes, scores and labels.
"""
nms_cfg = self.test_cfg.nms_cfg if not self.training \
else self.train_cfg.nms_cfg
if nms_cfg.use_rotate_nms:
nms_func = nms_bev
else:
......@@ -323,14 +357,14 @@ class PointRPNHead(BaseModule):
bbox = bbox[nonempty_box_mask]
if self.test_cfg.score_thr is not None:
score_thr = self.test_cfg.score_thr
if nms_cfg.score_thr is not None:
score_thr = nms_cfg.score_thr
keep = (obj_scores >= score_thr)
obj_scores = obj_scores[keep]
sem_scores = sem_scores[keep]
bbox = bbox.tensor[keep]
if obj_scores.shape[0] > 0:
if bbox.tensor.shape[0] > 0:
topk = min(nms_cfg.nms_pre, obj_scores.shape[0])
obj_scores_nms, indices = torch.topk(obj_scores, k=topk)
bbox_for_nms = xywhr2xyxyr(bbox[indices].bev)
......@@ -343,15 +377,22 @@ class PointRPNHead(BaseModule):
score_selected = obj_scores_nms[keep]
cls_preds = sem_scores_nms[keep]
labels = torch.argmax(cls_preds, -1)
if bbox_selected.shape[0] > nms_cfg.nms_post:
_, inds = score_selected.sort(descending=True)
inds = inds[:nms_cfg.nms_post]
bbox_selected = bbox_selected[inds, :]
labels = labels[inds]
score_selected = score_selected[inds]
cls_preds = cls_preds[inds, :]
else:
bbox_selected = bbox.tensor
score_selected = obj_scores.new_zeros([0])
labels = obj_scores.new_zeros([0])
cls_preds = obj_scores.new_zeros([0, sem_scores.shape[-1]])
return bbox_selected, score_selected, labels, cls_preds
def _assign_targets_by_points_inside(self, bboxes_3d, points):
def _assign_targets_by_points_inside(self, bboxes_3d: BaseInstance3DBoxes,
points: Tensor) -> Tuple[Tensor]:
"""Compute assignment by checking whether point is inside bbox.
Args:
......@@ -379,3 +420,92 @@ class PointRPNHead(BaseModule):
raise NotImplementedError('Unsupported bbox type!')
return points_mask, assignment
def predict(self, feats_dict: Dict,
batch_data_samples: SampleList) -> InstanceList:
"""Perform forward propagation of the 3D detection head and predict
detection results on the features of the upstream network.
Args:
feats_dict (dict): Contains features from the first stage.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
samples. It usually includes information such as
`gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
Returns:
list[:obj:`InstanceData`]: Detection results of each sample
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
contains a tensor with shape (num_instances, C), where
C >= 7.
"""
batch_input_metas = [
data_samples.metainfo for data_samples in batch_data_samples
]
raw_points = feats_dict.pop('raw_points')
bbox_preds, cls_preds = self(feats_dict)
proposal_cfg = self.test_cfg
proposal_list = self.predict_by_feat(
raw_points,
bbox_preds,
cls_preds,
cfg=proposal_cfg,
batch_input_metas=batch_input_metas)
feats_dict['points_cls_preds'] = cls_preds
return proposal_list
def loss_and_predict(self,
feats_dict: Dict,
batch_data_samples: SampleList,
proposal_cfg: Optional[dict] = None,
**kwargs) -> Tuple[dict, InstanceList]:
"""Perform forward propagation of the head, then calculate loss and
predictions from the features and data samples.
Args:
feats_dict (dict): Contains features from the first stage.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
samples. It usually includes information such as
`gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
proposal_cfg (ConfigDict, optional): Proposal config.
Returns:
tuple: the return value is a tuple contains:
- losses: (dict[str, Tensor]): A dictionary of loss components.
- predictions (list[:obj:`InstanceData`]): Detection
results of each sample after the post process.
"""
batch_gt_instances_3d = []
batch_gt_instances_ignore = []
batch_input_metas = []
for data_sample in batch_data_samples:
batch_input_metas.append(data_sample.metainfo)
batch_gt_instances_3d.append(data_sample.gt_instances_3d)
batch_gt_instances_ignore.append(
data_sample.get('ignored_instances', None))
raw_points = feats_dict.pop('raw_points')
bbox_preds, cls_preds = self(feats_dict)
loss_inputs = (bbox_preds, cls_preds,
raw_points) + (batch_gt_instances_3d, batch_input_metas,
batch_gt_instances_ignore)
losses = self.loss_by_feat(*loss_inputs)
predictions = self.predict_by_feat(
raw_points,
bbox_preds,
cls_preds,
batch_input_metas=batch_input_metas,
cfg=proposal_cfg)
feats_dict['points_cls_preds'] = cls_preds
if predictions[0].bboxes_3d.tensor.isinf().any():
print(predictions)
return losses, predictions
......@@ -14,7 +14,6 @@ from mmdet3d.structures.bbox_3d import (DepthInstance3DBoxes,
LiDARInstance3DBoxes,
rotation_3d_in_axis)
from mmdet.models.utils import multi_apply
from ..builder import build_loss
from .vote_head import VoteHead
......@@ -76,8 +75,8 @@ class SSD3DHead(VoteHead):
size_res_loss=size_res_loss,
semantic_loss=None,
init_cfg=init_cfg)
self.corner_loss = build_loss(corner_loss)
self.vote_loss = build_loss(vote_loss)
self.corner_loss = MODELS.build(corner_loss)
self.vote_loss = MODELS.build(vote_loss)
self.num_candidates = vote_module_cfg['num_points']
def _get_cls_out_channels(self) -> int:
......
# Copyright (c) OpenMMLab. All rights reserved.
from .base import Base3DDetector
from .centerpoint import CenterPoint
from .dfm import DfM
from .dynamic_voxelnet import DynamicVoxelNet
from .fcos_mono3d import FCOSMono3D
from .groupfree3dnet import GroupFree3DNet
from .h3dnet import H3DNet
from .imvotenet import ImVoteNet
from .imvoxelnet import ImVoxelNet
from .multiview_dfm import MultiViewDfM
from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN
from .mvx_two_stage import MVXTwoStageDetector
from .parta2 import PartA2
......@@ -19,9 +21,25 @@ from .votenet import VoteNet
from .voxelnet import VoxelNet
__all__ = [
'Base3DDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXTwoStageDetector',
'DynamicMVXFasterRCNN', 'MVXFasterRCNN', 'PartA2', 'VoteNet', 'H3DNet',
'CenterPoint', 'SSD3DNet', 'ImVoteNet', 'SingleStageMono3DDetector',
'FCOSMono3D', 'ImVoxelNet', 'GroupFree3DNet', 'PointRCNN', 'SMOKEMono3D',
'SASSD'
'Base3DDetector',
'DfM',
'VoxelNet',
'DynamicVoxelNet',
'MVXTwoStageDetector',
'DynamicMVXFasterRCNN',
'MVXFasterRCNN',
'MultiViewDfM',
'PartA2',
'VoteNet',
'H3DNet',
'CenterPoint',
'SSD3DNet',
'ImVoteNet',
'SingleStageMono3DDetector',
'FCOSMono3D',
'ImVoxelNet',
'GroupFree3DNet',
'PointRCNN',
'SMOKEMono3D',
'SASSD',
]
......@@ -89,7 +89,7 @@ class Base3DDetector(BaseDetector):
raise RuntimeError(f'Invalid mode "{mode}". '
'Only supports loss, predict and tensor mode')
def convert_to_datasample(
def add_pred_to_datasample(
self,
data_samples: SampleList,
data_instances_3d: OptInstanceList = None,
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet3d.registry import MODELS
from mmdet3d.structures.ops import bbox3d2result
from mmdet3d.utils import ConfigType
from mmdet.models.detectors import BaseDetector
from ..builder import build_backbone, build_head, build_neck
@MODELS.register_module()
class DfM(BaseDetector):
r"""`Monocular 3D Object Detection with Depth from Motion.
<https://arxiv.org/abs/2207.12988>`_.
Args:
backbone (:obj:`ConfigDict` or dict): The backbone config.
neck (:obj:`ConfigDict` or dict): The neck config.
backbone_stereo (:obj:`ConfigDict` or dict): The stereo backbone
config.
backbone_3d (:obj:`ConfigDict` or dict): The 3d backbone config.
neck_3d (:obj:`ConfigDict` or dict): The 3D neck config.
bbox_head_3d (:obj:`ConfigDict` or dict): The 3d bbox head config.
neck_2d (:obj:`ConfigDict` or dict, optional): The 2D neck config
for 2D object detection. Defaults to None.
bbox_head_2d (:obj:`ConfigDict` or dict, optional): The 2D bbox
head config for 2D object detection. Defaults to None.
depth_head_2d (:obj:`ConfigDict` or dict, optional): The 2D depth
head config for depth estimation in fov space. Defaults to None.
depth_head (:obj:`ConfigDict` or dict, optional): The depth head
config for depth estimation in 3D voxel space projected to FoV
space. Defaults to None.
train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of
training hyper-parameters. Defaults to None.
test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test
hyper-parameters. Defaults to None.
pretrained (:obj:`ConfigDict` or dict, optional): The pretrained
config.
init_cfg (:obj:`ConfigDict` or dict, optional): The initialization
config. Defaults to None.
"""
def __init__(self,
backbone: ConfigType,
neck: ConfigType,
backbone_stereo: ConfigType,
backbone_3d: ConfigType,
neck_3d: ConfigType,
bbox_head_3d: ConfigType,
neck_2d=None,
bbox_head_2d=None,
depth_head_2d=None,
depth_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.backbone = build_backbone(backbone)
self.neck = build_neck(neck)
if backbone_stereo is not None:
backbone_stereo.update(cat_img_feature=self.neck.cat_img_feature)
backbone_stereo.update(in_sem_channels=self.neck.sem_channels[-1])
self.backbone_stereo = build_backbone(backbone_stereo)
assert self.neck.cat_img_feature == \
self.backbone_stereo.cat_img_feature
assert self.neck.sem_channels[
-1] == self.backbone_stereo.in_sem_channels
if backbone_3d is not None:
self.backbone_3d = build_backbone(backbone_3d)
if neck_3d is not None:
self.neck_3d = build_neck(neck_3d)
if neck_2d is not None:
self.neck_2d = build_neck(neck_2d)
if bbox_head_2d is not None:
self.bbox_head_2d = build_head(bbox_head_2d)
if depth_head_2d is not None:
self.depth_head_2d = build_head(depth_head_2d)
if depth_head is not None:
self.depth_head = build_head(depth_head)
self.depth_samples = self.depth_head.depth_samples
self.train_cfg = train_cfg
self.test_cfg = test_cfg
bbox_head_3d.update(train_cfg=train_cfg)
bbox_head_3d.update(test_cfg=test_cfg)
self.bbox_head_3d = build_head(bbox_head_3d)
@property
def with_backbone_3d(self):
"""Whether the detector has a 3D backbone."""
return hasattr(self, 'backbone_3d') and self.backbone_3d is not None
@property
def with_neck_3d(self):
"""Whether the detector has a 3D neck."""
return hasattr(self, 'neck_3d') and self.neck_3d is not None
@property
def with_neck_2d(self):
"""Whether the detector has a 2D neck."""
return hasattr(self, 'neck_2d') and self.neck_2d is not None
@property
def with_bbox_head_2d(self):
"""Whether the detector has a 2D detection head."""
return hasattr(self, 'bbox_head_2d') and self.bbox_head_2d is not None
@property
def with_depth_head_2d(self):
"""Whether the detector has a image-based depth head."""
return hasattr(self,
'depth_head_2d') and self.depth_head_2d is not None
@property
def with_depth_head(self):
"""Whether the detector has a frustum-based depth head."""
return hasattr(self, 'depth_head') and self.depth_head is not None
def extract_feat(self, img, img_metas):
"""Feature extraction for perspective-view images.
Args:
img (torch.Tensor): Images of shape [B, N, C_in, H, W].
img_metas (list): Image meta information. Each element corresponds
to a group of images. len(img_metas) == B.
Returns:
torch.Tensor: bev feature with shape [B, C_out, N_y, N_x].
"""
# split input img into current and previous ones
batch_size, N, C_in, H, W = img.shape
cur_imgs = img[:, 0]
prev_imgs = img[:, 1] # TODO: to support multiple prev imgs
# 2D backbone for feature extraction
cur_feats = self.backbone(cur_imgs)
cur_feats = [cur_imgs] + list(cur_feats)
prev_feats = self.backbone(prev_imgs)
prev_feats = [prev_imgs] + list(prev_feats)
# SPP module as the feature neck
cur_stereo_feat, cur_sem_feat = self.neck(cur_feats)
prev_stereo_feat, prev_sem_feat = self.neck(prev_feats)
# derive cur2prevs
cur_pose = torch.tensor(
[img_meta['cam2global'] for img_meta in img_metas],
device=img.device)[:, None, :, :] # (B, 1, 4, 4)
prev_poses = []
for img_meta in img_metas:
sweep_img_metas = img_meta['sweep_img_metas']
prev_poses.append([
sweep_img_meta['cam2global']
for sweep_img_meta in sweep_img_metas
])
prev_poses = torch.tensor(prev_poses, device=img.device)
pad_prev_cam2global = torch.eye(4)[None, None].expand(
batch_size, N - 1, 4, 4).to(img.device)
pad_prev_cam2global[:, :, :prev_poses.shape[-2], :prev_poses.
shape[-1]] = prev_poses
pad_cur_cam2global = torch.eye(4)[None,
None].expand(batch_size, 1, 4,
4).to(img.device)
pad_cur_cam2global[:, :, :cur_pose.shape[-2], :cur_pose.
shape[-1]] = cur_pose
# (B, N-1, 4, 4) * (B, 1, 4, 4) -> (B, N-1, 4, 4)
# torch.linalg.solve is faster and more numerically stable
# than torch.matmul(torch.linalg.inv(A), B)
# empirical results show that torch.linalg.solve can derive
# almost the same result with np.linalg.inv
# while torch.linalg.inv can not
cur2prevs = torch.linalg.solve(pad_prev_cam2global, pad_cur_cam2global)
for meta_idx, img_meta in enumerate(img_metas):
img_meta['cur2prevs'] = cur2prevs[meta_idx]
# stereo backbone for depth estimation
# volume_feat: (batch_size, Cv, Nz, Ny, Nx)
volume_feat = self.backbone_stereo(cur_stereo_feat, prev_stereo_feat,
img_metas, cur_sem_feat)
# height compression
_, Cv, Nz, Ny, Nx = volume_feat.shape
bev_feat = volume_feat.view(batch_size, Cv * Nz, Ny, Nx)
bev_feat_prehg, bev_feat = self.neck_3d(bev_feat)
return bev_feat
def forward_train(self,
img,
img_metas,
gt_bboxes_3d,
gt_labels_3d,
depth_img=None,
**kwargs):
"""Forward function for training."""
bev_feat = self.extract_feat(img, img_metas)
outs = self.bbox_head_3d([bev_feat])
losses = self.bbox_head_3d.loss(*outs, gt_bboxes_3d, gt_labels_3d,
img_metas)
# TODO: loss_dense_depth, loss_2d, loss_imitation
return losses
def forward_test(self, img, img_metas, **kwargs):
"""Forward of testing.
Args:
img (torch.Tensor): Input images of shape (N, C_in, H, W).
img_metas (list): Image metas.
Returns:
list[dict]: Predicted 3d boxes.
"""
# not supporting aug_test for now
return self.simple_test(img, img_metas)
def simple_test(self, img, img_metas):
"""Simple inference forward without test time augmentation."""
bev_feat = self.extract_feat(img, img_metas)
# bbox_head takes a list of feature from different levels as input
# so need [bev_feat]
outs = self.bbox_head_3d([bev_feat])
bbox_list = self.bbox_head_3d.get_bboxes(*outs, img_metas)
bbox_results = [
bbox3d2result(det_bboxes, det_scores, det_labels)
for det_bboxes, det_scores, det_labels in bbox_list
]
# add pseudo-lidar label to each pred_dict for post-processing
for bbox_result in bbox_results:
bbox_result['pseudo_lidar'] = True
return bbox_results
def aug_test(self, imgs, img_metas, **kwargs):
"""Test with augmentations.
Args:
imgs (list[torch.Tensor]): Input images of shape (N, C_in, H, W).
img_metas (list): Image metas.
Returns:
list[dict]: Predicted 3d boxes.
"""
raise NotImplementedError
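To ground the cur2prevs comment in extract_feat above, a tiny check with random poses (purely illustrative) that torch.linalg.solve(A, B) matches inv(A) @ B:
import torch

A = torch.eye(4) + 0.01 * torch.rand(4, 4)  # stand-in for a prev cam2global pose
B = torch.eye(4) + 0.01 * torch.rand(4, 4)  # stand-in for the cur cam2global pose
print(torch.allclose(torch.linalg.solve(A, B), torch.linalg.inv(A) @ B, atol=1e-5))  # True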
......@@ -10,7 +10,8 @@ from .voxelnet import VoxelNet
@MODELS.register_module()
class DynamicVoxelNet(VoxelNet):
r"""VoxelNet using `dynamic voxelization <https://arxiv.org/abs/1910.06528>`_.
r"""VoxelNet using `dynamic voxelization
<https://arxiv.org/abs/1910.06528>`_.
"""
def __init__(self,
......
......@@ -95,6 +95,7 @@ class FCOSMono3D(SingleStageMono3DDetector):
x = self.extract_feat(batch_inputs_dict)
results_list, results_list_2d = self.bbox_head.predict(
x, batch_data_samples, rescale=rescale)
predictions = self.convert_to_datasample(batch_data_samples,
results_list, results_list_2d)
predictions = self.add_pred_to_datasample(batch_data_samples,
results_list,
results_list_2d)
return predictions
......@@ -82,6 +82,6 @@ class GroupFree3DNet(SingleStage3DDetector):
points = batch_inputs_dict['points']
results_list = self.bbox_head.predict(points, x, batch_data_samples,
**kwargs)
predictions = self.convert_to_datasample(batch_data_samples,
results_list)
predictions = self.add_pred_to_datasample(batch_data_samples,
results_list)
return predictions
......@@ -154,4 +154,4 @@ class H3DNet(TwoStage3DDetector):
feats_dict,
batch_data_samples,
suffix='_optimized')
return self.convert_to_datasample(batch_data_samples, results_list)
return self.add_pred_to_datasample(batch_data_samples, results_list)
......@@ -433,7 +433,7 @@ class ImVoteNet(Base3DDetector):
if points is None:
assert imgs is not None
results_2d = self.predict_img_only(imgs, batch_data_samples)
return self.convert_to_datasample(
return self.add_pred_to_datasample(
batch_data_samples, data_instances_2d=results_2d)
else:
......@@ -488,7 +488,7 @@ class ImVoteNet(Base3DDetector):
batch_data_samples,
rescale=True)
return self.convert_to_datasample(batch_data_samples, results_3d)
return self.add_pred_to_datasample(batch_data_samples, results_3d)
def predict_img_only(self,
imgs: Tensor,
......
......@@ -2,16 +2,17 @@
from typing import List, Tuple, Union
import torch
from mmengine.structures import InstanceData
from mmdet3d.models.detectors import Base3DDetector
from mmdet3d.models.layers.fusion_layers.point_fusion import point_sample
from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures.det3d_data_sample import SampleList
from mmdet3d.utils import ConfigType, InstanceList, OptConfigType
from mmdet.models.detectors import BaseDetector
from mmdet3d.utils import ConfigType, OptConfigType, OptInstanceList
@MODELS.register_module()
class ImVoxelNet(BaseDetector):
class ImVoxelNet(Base3DDetector):
r"""`ImVoxelNet <https://arxiv.org/abs/2106.01178>`_.
Args:
......@@ -57,31 +58,6 @@ class ImVoxelNet(BaseDetector):
self.train_cfg = train_cfg
self.test_cfg = test_cfg
def convert_to_datasample(self, data_samples: SampleList,
data_instances: InstanceList) -> SampleList:
""" Convert results list to `Det3DDataSample`.
Args:
inputs (list[:obj:`Det3DDataSample`]): The input data.
data_instances (list[:obj:`InstanceData`]): 3D Detection
results of each image.
Returns:
list[:obj:`Det3DDataSample`]: 3D Detection results of the
input images. Each Det3DDataSample usually contain
'pred_instances_3d'. And the ``pred_instances_3d`` usually
contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instance, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (Tensor): Contains a tensor with shape
(num_instances, C) where C >=7.
"""
for data_sample, pred_instances_3d in zip(data_samples,
data_instances):
data_sample.pred_instances_3d = pred_instances_3d
return data_samples
def extract_feat(self, batch_inputs_dict: dict,
batch_data_samples: SampleList):
"""Extract 3d features from the backbone -> fpn -> 3d projection.
......@@ -185,8 +161,8 @@ class ImVoxelNet(BaseDetector):
"""
x = self.extract_feat(batch_inputs_dict, batch_data_samples)
results_list = self.bbox_head.predict(x, batch_data_samples, **kwargs)
predictions = self.convert_to_datasample(batch_data_samples,
results_list)
predictions = self.add_pred_to_datasample(batch_data_samples,
results_list)
return predictions
def _forward(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
......@@ -209,3 +185,64 @@ class ImVoxelNet(BaseDetector):
x = self.extract_feat(batch_inputs_dict, batch_data_samples)
results = self.bbox_head.forward(x)
return results
def convert_to_datasample(
self,
data_samples: SampleList,
data_instances_3d: OptInstanceList = None,
data_instances_2d: OptInstanceList = None,
) -> SampleList:
"""Convert results list to `Det3DDataSample`.
Subclasses could override it to be compatible for some multi-modality
3D detectors.
Args:
data_samples (list[:obj:`Det3DDataSample`]): The input data.
data_instances_3d (list[:obj:`InstanceData`], optional): 3D
Detection results of each sample.
data_instances_2d (list[:obj:`InstanceData`], optional): 2D
Detection results of each sample.
Returns:
list[:obj:`Det3DDataSample`]: Detection results of the
input. Each Det3DDataSample usually contains
'pred_instances_3d'. And the ``pred_instances_3d`` normally
contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instance, )
- labels_3d (Tensor): Labels of 3D bboxes, has a shape
(num_instances, ).
- bboxes_3d (Tensor): Contains a tensor with shape
(num_instances, C) where C >=7.
When there are image predictions in some models, it should
contain `pred_instances`, and the ``pred_instances`` normally
contains following keys.
- scores (Tensor): Classification scores of image, has a shape
(num_instance, )
- labels (Tensor): Predict Labels of 2D bboxes, has a shape
(num_instances, ).
- bboxes (Tensor): Contains a tensor with shape
(num_instances, 4).
"""
assert (data_instances_2d is not None) or \
(data_instances_3d is not None),\
'please pass at least one type of data_samples'
if data_instances_2d is None:
data_instances_2d = [
InstanceData() for _ in range(len(data_instances_3d))
]
if data_instances_3d is None:
data_instances_3d = [
InstanceData() for _ in range(len(data_instances_2d))
]
for i, data_sample in enumerate(data_samples):
data_sample.pred_instances_3d = data_instances_3d[i]
data_sample.pred_instances = data_instances_2d[i]
return data_samples
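A minimal sketch of the pairing behavior above, assuming bare Det3DDataSample and InstanceData objects; when one modality is missing, it is filled with an empty InstanceData.
import torch
from mmengine.structures import InstanceData
from mmdet3d.structures.det3d_data_sample import Det3DDataSample

samples = [Det3DDataSample() for _ in range(2)]
instances_3d = [InstanceData(scores_3d=torch.rand(3)) for _ in range(2)]
for data_sample, pred_3d in zip(samples, instances_3d):
    data_sample.pred_instances_3d = pred_3d
    data_sample.pred_instances = InstanceData()  # empty 2D placeholder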