Unverified commit d7067e44, authored by Wenwei Zhang, committed by GitHub

Bump version to v1.1.0rc2

Bump to v1.1.0rc2
parents 28fe73d2 fb0e57e5
@@ -7,7 +7,7 @@ from .nuscenes_metric import NuScenesMetric  # noqa: F401,F403
 from .seg_metric import SegMetric  # noqa: F401,F403
 from .waymo_metric import WaymoMetric  # noqa: F401,F403

-__all_ = [
+__all__ = [
     'KittiMetric', 'NuScenesMetric', 'IndoorMetric', 'LyftMetric', 'SegMetric',
     'InstanceSegMetric', 'WaymoMetric'
 ]
@@ -3,13 +3,13 @@ from collections import OrderedDict
 from typing import Dict, List, Optional, Sequence

 import numpy as np
+from mmdet.evaluation import eval_map
 from mmengine.evaluator import BaseMetric
 from mmengine.logging import MMLogger

 from mmdet3d.evaluation import indoor_eval
 from mmdet3d.registry import METRICS
 from mmdet3d.structures import get_box_type
-from mmdet.evaluation import eval_map

 @METRICS.register_module()
@@ -78,14 +78,15 @@ class IndoorMetric(BaseMetric):
             ann_infos.append(eval_ann)
             pred_results.append(sinlge_pred_results)

+        # some checkpoints may not record the key "box_type_3d"
         box_type_3d, box_mode_3d = get_box_type(
-            self.dataset_meta['box_type_3d'])
+            self.dataset_meta.get('box_type_3d', 'depth'))

         ret_dict = indoor_eval(
             ann_infos,
             pred_results,
             self.iou_thr,
-            self.dataset_meta['CLASSES'],
+            self.dataset_meta['classes'],
             logger=logger,
             box_mode_3d=box_mode_3d)
@@ -141,7 +142,7 @@ class Indoor2DMetric(BaseMetric):
             pred_labels = pred['labels'].cpu().numpy()
             dets = []
-            for label in range(len(self.dataset_meta['CLASSES'])):
+            for label in range(len(self.dataset_meta['classes'])):
                 index = np.where(pred_labels == label)[0]
                 pred_bbox_scores = np.hstack(
                     [pred_bboxes[index], pred_scores[index].reshape((-1, 1))])
@@ -170,7 +171,7 @@ class Indoor2DMetric(BaseMetric):
                 annotations,
                 scale_ranges=None,
                 iou_thr=iou_thr_2d_single,
-                dataset=self.dataset_meta['CLASSES'],
+                dataset=self.dataset_meta['classes'],
                 logger=logger)
             eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap
         return eval_results
@@ -64,7 +64,7 @@ class InstanceSegMetric(BaseMetric):
         """
         logger: MMLogger = MMLogger.get_current_instance()

-        self.classes = self.dataset_meta['CLASSES']
+        self.classes = self.dataset_meta['classes']
         self.valid_class_ids = self.dataset_meta['seg_valid_class_ids']

         gt_semantic_masks = []
...
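A minimal sketch of the consumer-side effect of the metainfo key rename applied across these metrics (key names from the diff; the class tuple is illustrative):

# v1.1.0rc2 dataset metainfo uses lowercase keys, and older checkpoints may
# lack 'box_type_3d', hence the .get() fallback in IndoorMetric above.
dataset_meta = {'classes': ('Pedestrian', 'Cyclist', 'Car')}  # illustrative
classes = dataset_meta['classes']  # formerly dataset_meta['CLASSES']
box_type_3d = dataset_meta.get('box_type_3d', 'depth')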
@@ -36,6 +36,10 @@ class KittiMetric(BaseMetric):
             If not specified, a temp file will be created. Default: None.
         default_cam_key (str, optional): The default camera for lidar to
             camera conversion. By default, KITTI: CAM2, Waymo: CAM_FRONT.
+        format_only (bool): Format the output results without performing
+            evaluation. It is useful when you want to format the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
         submission_prefix (str, optional): The prefix of submission data.
             If not specified, the submission data will not be generated.
             Default: None.
@@ -52,6 +56,7 @@ class KittiMetric(BaseMetric):
                  prefix: Optional[str] = None,
                  pklfile_prefix: str = None,
                  default_cam_key: str = 'CAM2',
+                 format_only: bool = False,
                  submission_prefix: str = None,
                  collect_device: str = 'cpu',
                  file_client_args: dict = dict(backend='disk')):
@@ -61,6 +66,13 @@ class KittiMetric(BaseMetric):
         self.pcd_limit_range = pcd_limit_range
         self.ann_file = ann_file
         self.pklfile_prefix = pklfile_prefix
+        self.format_only = format_only
+        if self.format_only:
+            assert submission_prefix is not None, (
+                'submission_prefix must not be None when format_only is '
+                'True, otherwise the result files will be saved to a temp '
+                'directory which will be cleaned up at the end.')
         self.submission_prefix = submission_prefix
         self.pred_box_type_3d = pred_box_type_3d
         self.default_cam_key = default_cam_key
@@ -74,68 +86,62 @@ class KittiMetric(BaseMetric):
             raise KeyError("metric should be one of 'bbox', 'img_bbox', "
                            f'but got {metric}.')

-    def convert_annos_to_kitti_annos(
-            self,
-            data_annos: list,
-            classes: list = [
-                'Pedestrian', 'Cyclist', 'Car', 'Van', 'Truck', 'Person_sitting',
-                'Tram', 'Misc'
-            ]) -> list:
+    def convert_annos_to_kitti_annos(self, data_infos: dict) -> list:
         """Convert loading annotations to Kitti annotations.

         Args:
-            data_annos (list[dict]): Annotations loaded from ann_file.
-            classes (list[str]): Classes used in the dataset. Default used
-                ['Pedestrian', 'Cyclist', 'Car', 'Van', 'Truck',
-                'Person_sitting', 'Tram', 'Misc'].
+            data_infos (dict): Data infos including metainfo and annotations
+                loaded from ann_file.

         Returns:
             List[dict]: List of Kitti annotations.
         """
-        assert 'instances' in data_annos[0]
-        for i, annos in enumerate(data_annos):
-            if len(annos['instances']) == 0:
-                kitti_annos = {
-                    'name': np.array([]),
-                    'truncated': np.array([]),
-                    'occluded': np.array([]),
-                    'alpha': np.array([]),
-                    'bbox': np.zeros([0, 4]),
-                    'dimensions': np.zeros([0, 3]),
-                    'location': np.zeros([0, 3]),
-                    'rotation_y': np.array([]),
-                    'score': np.array([]),
-                }
-            else:
-                kitti_annos = {
-                    'name': [],
-                    'truncated': [],
-                    'occluded': [],
-                    'alpha': [],
-                    'bbox': [],
-                    'location': [],
-                    'dimensions': [],
-                    'rotation_y': [],
-                    'score': []
-                }
-                for instance in annos['instances']:
-                    labels = instance['bbox_label']
-                    if labels == -1:
-                        kitti_annos['name'].append('DontCare')
-                    else:
-                        kitti_annos['name'].append(classes[labels])
-                    kitti_annos['truncated'].append(instance['truncated'])
-                    kitti_annos['occluded'].append(instance['occluded'])
-                    kitti_annos['alpha'].append(instance['alpha'])
-                    kitti_annos['bbox'].append(instance['bbox'])
-                    kitti_annos['location'].append(instance['bbox_3d'][:3])
-                    kitti_annos['dimensions'].append(instance['bbox_3d'][3:6])
-                    kitti_annos['rotation_y'].append(instance['bbox_3d'][6])
-                    kitti_annos['score'].append(instance['score'])
-            for name in kitti_annos:
-                kitti_annos[name] = np.array(kitti_annos[name])
-            data_annos[i]['kitti_annos'] = kitti_annos
+        data_annos = data_infos['data_list']
+        if not self.format_only:
+            cat2label = data_infos['metainfo']['categories']
+            label2cat = dict((v, k) for (k, v) in cat2label.items())
+            assert 'instances' in data_annos[0]
+            for i, annos in enumerate(data_annos):
+                if len(annos['instances']) == 0:
+                    kitti_annos = {
+                        'name': np.array([]),
+                        'truncated': np.array([]),
+                        'occluded': np.array([]),
+                        'alpha': np.array([]),
+                        'bbox': np.zeros([0, 4]),
+                        'dimensions': np.zeros([0, 3]),
+                        'location': np.zeros([0, 3]),
+                        'rotation_y': np.array([]),
+                        'score': np.array([]),
+                    }
+                else:
+                    kitti_annos = {
+                        'name': [],
+                        'truncated': [],
+                        'occluded': [],
+                        'alpha': [],
+                        'bbox': [],
+                        'location': [],
+                        'dimensions': [],
+                        'rotation_y': [],
+                        'score': []
+                    }
+                    for instance in annos['instances']:
+                        label = instance['bbox_label']
+                        kitti_annos['name'].append(label2cat[label])
+                        kitti_annos['truncated'].append(instance['truncated'])
+                        kitti_annos['occluded'].append(instance['occluded'])
+                        kitti_annos['alpha'].append(instance['alpha'])
+                        kitti_annos['bbox'].append(instance['bbox'])
+                        kitti_annos['location'].append(instance['bbox_3d'][:3])
+                        kitti_annos['dimensions'].append(
+                            instance['bbox_3d'][3:6])
+                        kitti_annos['rotation_y'].append(
+                            instance['bbox_3d'][6])
+                        kitti_annos['score'].append(instance['score'])
+                    for name in kitti_annos:
+                        kitti_annos[name] = np.array(kitti_annos[name])
+                data_annos[i]['kitti_annos'] = kitti_annos
         return data_annos

     def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
@@ -176,24 +182,29 @@ class KittiMetric(BaseMetric):
             the metrics, and the values are corresponding results.
         """
         logger: MMLogger = MMLogger.get_current_instance()
-        self.classes = self.dataset_meta['CLASSES']
+        self.classes = self.dataset_meta['classes']

         # load annotations
-        pkl_annos = load(
-            self.ann_file, file_client_args=self.file_client_args)['data_list']
-        self.data_infos = self.convert_annos_to_kitti_annos(pkl_annos)
+        pkl_infos = load(self.ann_file, file_client_args=self.file_client_args)
+        self.data_infos = self.convert_annos_to_kitti_annos(pkl_infos)
         result_dict, tmp_dir = self.format_results(
             results,
             pklfile_prefix=self.pklfile_prefix,
             submission_prefix=self.submission_prefix,
             classes=self.classes)
+        metric_dict = {}
+
+        if self.format_only:
+            logger.info('results are saved in '
+                        f'{osp.dirname(self.submission_prefix)}')
+            return metric_dict

         gt_annos = [
             self.data_infos[result['sample_idx']]['kitti_annos']
             for result in results
         ]
-        metric_dict = {}

         for metric in self.metrics:
             ap_dict = self.kitti_evaluate(
                 result_dict,
@@ -331,7 +342,7 @@ class KittiMetric(BaseMetric):
             mmengine.mkdir_or_exist(submission_prefix)

         det_annos = []
-        print('\nConverting prediction to KITTI format')
+        print('\nConverting 3D prediction to KITTI format')
         for idx, pred_dicts in enumerate(
                 mmengine.track_iter_progress(net_outputs)):
             annos = []
@@ -457,7 +468,7 @@ class KittiMetric(BaseMetric):
         assert len(net_outputs) == len(self.data_infos), \
             'invalid list length of network outputs'
         det_annos = []
-        print('\nConverting prediction to KITTI format')
+        print('\nConverting 2D prediction to KITTI format')
         for i, bboxes_per_sample in enumerate(
                 mmengine.track_iter_progress(net_outputs)):
             annos = []
@@ -526,7 +537,7 @@ class KittiMetric(BaseMetric):
             mmengine.mkdir_or_exist(submission_prefix)
             print(f'Saving KITTI submission to {submission_prefix}')
             for i, anno in enumerate(det_annos):
-                sample_idx = self.data_infos[i]['image']['image_idx']
+                sample_idx = sample_id_list[i]
                 cur_det_file = f'{submission_prefix}/{sample_idx:06d}.txt'
                 with open(cur_det_file, 'w') as f:
                     bbox = anno['bbox']
...
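A hypothetical evaluator config showing how the new `format_only` switch might be used (type and argument names come from the diff; paths are placeholders):

# Dump KITTI-format result files without computing metrics.
val_evaluator = dict(
    type='KittiMetric',
    ann_file='data/kitti/kitti_infos_val.pkl',  # placeholder path
    metric='bbox',
    format_only=True,
    # must be set when format_only=True; otherwise results land in a
    # temporary directory that is cleaned up afterwards
    submission_prefix='./kitti_results')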
@@ -110,7 +110,7 @@ class LyftMetric(BaseMetric):
         """
         logger: MMLogger = MMLogger.get_current_instance()
-        classes = self.dataset_meta['CLASSES']
+        classes = self.dataset_meta['classes']
         self.version = self.dataset_meta['version']

         # load annotations
...
@@ -151,7 +151,7 @@ class NuScenesMetric(BaseMetric):
         """
         logger: MMLogger = MMLogger.get_current_instance()
-        classes = self.dataset_meta['CLASSES']
+        classes = self.dataset_meta['classes']
         self.version = self.dataset_meta['version']

         # load annotations
         self.data_infos = load(
...
@@ -36,14 +36,24 @@ class WaymoMetric(KittiMetric):
             names to disambiguate homonymous metrics of different evaluators.
             If prefix is not provided in the argument, self.default_prefix
             will be used instead. Defaults to None.
+        convert_kitti_format (bool, optional): Whether to convert the results
+            to kitti format. Now, in order to be compatible with camera-based
+            methods, defaults to True.
         pklfile_prefix (str, optional): The prefix of pkl files, including
             the file path and the prefix of filename, e.g., "a/b/prefix".
             If not specified, a temp file will be created. Default: None.
         submission_prefix (str, optional): The prefix of submission data.
             If not specified, the submission data will not be generated.
             Default: None.
-        task: (str, optional): task for 3D detection, if cam, would filter
-            the points that outside the image.
+        load_type (str, optional): Type of loading mode during training.
+
+            - 'frame_based': Load all of the instances in the frame.
+            - 'mv_image_based': Load all of the instances in the frame and
+              need to convert to the FOV-based data type to support
+              image-based detector.
+            - 'fov_image_based': Only load the instances inside the default
+              cam, and need to convert to the FOV-based data type to support
+              image-based detector.
         default_cam_key (str, optional): The default camera for lidar to
             camera conversion. By default, KITTI: CAM2, Waymo: CAM_FRONT.
         use_pred_sample_idx (bool, optional): In formatting results, use the
@@ -54,6 +64,11 @@ class WaymoMetric(KittiMetric):
             from different ranks during distributed training. Must be 'cpu' or
             'gpu'. Defaults to 'cpu'.
         file_client_args (dict): file client for reading gt in waymo format.
+            Defaults to ``dict(backend='disk')``.
+        idx2metainfo (Optional[str], optional): The file path of the metainfo
+            in waymo. It stores the mapping from sample_idx to metainfo.
+            The metainfo must contain the keys: 'idx2contextname' and
+            'idx2timestamp'. Defaults to None.
     """
     num_cams = 5
@@ -64,19 +79,28 @@ class WaymoMetric(KittiMetric):
                  split: str = 'training',
                  metric: Union[str, List[str]] = 'mAP',
                  pcd_limit_range: List[float] = [-85, -85, -5, 85, 85, 5],
+                 convert_kitti_format: bool = True,
                  prefix: Optional[str] = None,
                  pklfile_prefix: str = None,
                  submission_prefix: str = None,
-                 task='lidar',
+                 load_type: str = 'frame_based',
                  default_cam_key: str = 'CAM_FRONT',
                  use_pred_sample_idx: bool = False,
                  collect_device: str = 'cpu',
-                 file_client_args: dict = dict(backend='disk')):
+                 file_client_args: dict = dict(backend='disk'),
+                 idx2metainfo: Optional[str] = None):
         self.waymo_bin_file = waymo_bin_file
         self.data_root = data_root
         self.split = split
-        self.task = task
+        self.load_type = load_type
         self.use_pred_sample_idx = use_pred_sample_idx
+        self.convert_kitti_format = convert_kitti_format
+
+        if idx2metainfo is not None:
+            self.idx2metainfo = mmengine.load(idx2metainfo)
+        else:
+            self.idx2metainfo = None
+
         super().__init__(
             ann_file=ann_file,
             metric=metric,
@@ -100,13 +124,15 @@ class WaymoMetric(KittiMetric):
             the metrics, and the values are corresponding results.
         """
         logger: MMLogger = MMLogger.get_current_instance()
-        self.classes = self.dataset_meta['CLASSES']
+        self.classes = self.dataset_meta['classes']

         # load annotations
         self.data_infos = load(self.ann_file)['data_list']
+        assert len(results) == len(self.data_infos), \
+            'invalid list length of network outputs'
         # different from kitti, waymo do not need to convert the ann file
-        # handle the mono3d task
-        if self.task == 'mono3d':
+        # handle the mv_image_based load_type
+        if self.load_type == 'mv_image_based':
             new_data_infos = []
             for info in self.data_infos:
                 height = info['images'][self.default_cam_key]['height']
@@ -131,7 +157,7 @@ class WaymoMetric(KittiMetric):
                 # TODO check if need to modify the sample id
                 # TODO check when will use it except for evaluation.
-                camera_info['sample_id'] = info['sample_id']
+                camera_info['sample_idx'] = info['sample_idx']
                 new_data_infos.append(camera_info)
             self.data_infos = new_data_infos
@@ -142,8 +168,6 @@ class WaymoMetric(KittiMetric):
             eval_tmp_dir = None
             pklfile_prefix = self.pklfile_prefix

-        # load annotations
-
         result_dict, tmp_dir = self.format_results(
             results,
             pklfile_prefix=pklfile_prefix,
@@ -186,11 +210,7 @@ class WaymoMetric(KittiMetric):
                    f'compute_detection_metrics_main {pklfile_prefix}.bin ' + \
                    f'{self.waymo_bin_file}'
         print(eval_str)
-        ret_bytes = subprocess.check_output(
-            'mmdet3d/evaluation/functional/waymo_utils/' +
-            f'compute_detection_metrics_main {pklfile_prefix}.bin ' +
-            f'{self.waymo_bin_file}',
-            shell=True)
+        ret_bytes = subprocess.check_output(eval_str, shell=True)
         ret_texts = ret_bytes.decode('utf-8')
         print_log(ret_texts, logger=logger)
@@ -292,7 +312,7 @@ class WaymoMetric(KittiMetric):
                        pklfile_prefix: str = None,
                        submission_prefix: str = None,
                        classes: List[str] = None):
-        """Format the results to pkl file.
+        """Format the results to bin file.

         Args:
             results (list[dict]): Testing results of the
@@ -313,9 +333,22 @@ class WaymoMetric(KittiMetric):
             the formatted result, tmp_dir is the temporary directory created
             for saving json files when jsonfile_prefix is not specified.
         """
-        result_files, tmp_dir = super().format_results(results, pklfile_prefix,
-                                                       submission_prefix,
-                                                       classes)
+        waymo_save_tmp_dir = tempfile.TemporaryDirectory()
+        waymo_results_save_dir = waymo_save_tmp_dir.name
+        waymo_results_final_path = f'{pklfile_prefix}.bin'
+
+        if self.convert_kitti_format:
+            results_kitti_format, tmp_dir = super().format_results(
+                results, pklfile_prefix, submission_prefix, classes)
+            final_results = results_kitti_format['pred_instances_3d']
+        else:
+            final_results = results
+            for i, res in enumerate(final_results):
+                # Actually, `sample_idx` here is the filename without suffix.
+                # It's for identifying the sample in formatting.
+                res['sample_idx'] = self.data_infos[i]['sample_idx']
+                res['pred_instances_3d']['bboxes_3d'].limit_yaw(
+                    offset=0.5, period=np.pi * 2)

         waymo_root = self.data_root
         if self.split == 'training':
@@ -326,21 +359,23 @@ class WaymoMetric(KittiMetric):
             prefix = '2'
         else:
             raise ValueError('Not supported split value.')
-        waymo_save_tmp_dir = tempfile.TemporaryDirectory()
-        waymo_results_save_dir = waymo_save_tmp_dir.name
-        waymo_results_final_path = f'{pklfile_prefix}.bin'
-        from ..functional.waymo_utils.prediction_kitti_to_waymo import \
-            KITTI2Waymo
-        converter = KITTI2Waymo(
-            result_files['pred_instances_3d'],
+
+        from ..functional.waymo_utils.prediction_to_waymo import \
+            Prediction2Waymo
+        converter = Prediction2Waymo(
+            final_results,
             waymo_tfrecords_dir,
             waymo_results_save_dir,
             waymo_results_final_path,
             prefix,
-            file_client_args=self.file_client_args)
+            classes,
+            file_client_args=self.file_client_args,
+            from_kitti_format=self.convert_kitti_format,
+            idx2metainfo=self.idx2metainfo)
         converter.convert()
         waymo_save_tmp_dir.cleanup()

-        return result_files, waymo_save_tmp_dir
+        return final_results, waymo_save_tmp_dir

     def merge_multi_view_boxes(self, box_dict_per_frame: List[dict],
                                cam0_info: dict):
@@ -379,7 +414,7 @@ class WaymoMetric(KittiMetric):
             torch.from_numpy(box_dict['box3d_lidar']).cuda())
         scores = torch.from_numpy(box_dict['scores']).cuda()
         labels = torch.from_numpy(box_dict['label_preds']).long().cuda()
-        nms_scores = scores.new_zeros(scores.shape[0], len(self.CLASSES) + 1)
+        nms_scores = scores.new_zeros(scores.shape[0], len(self.classes) + 1)
         indices = labels.new_tensor(list(range(scores.shape[0])))
         nms_scores[indices, labels] = scores
         lidar_boxes3d_for_nms = xywhr2xyxyr(lidar_boxes3d.bev)
@@ -397,7 +432,7 @@ class WaymoMetric(KittiMetric):
         lidar2cam = cam0_info['images'][self.default_cam_key]['lidar2img']
         lidar2cam = np.array(lidar2cam).astype(np.float32)
         box_preds_camera = box_preds_lidar.convert_to(
-            Box3DMode.CAM, np.linalg.inv(lidar2cam), correct_yaw=True)
+            Box3DMode.CAM, lidar2cam, correct_yaw=True)
         # Note: bbox is meaningless in final evaluation, set to 0
         merged_box_dict = dict(
             bbox=np.zeros([box_preds_lidar.tensor.shape[0], 4]),
@@ -405,7 +440,7 @@ class WaymoMetric(KittiMetric):
             box3d_lidar=box_preds_lidar.tensor.numpy(),
             scores=scores.numpy(),
             label_preds=labels.numpy(),
-            sample_idx=box_dict['sample_id'],
+            sample_idx=box_dict['sample_idx'],
         )
         return merged_box_dict
@@ -431,8 +466,6 @@ class WaymoMetric(KittiMetric):
         Returns:
             list[dict]: A list of dictionaries with the kitti format.
         """
-        assert len(net_outputs) == len(self.data_infos), \
-            'invalid list length of network outputs'
         if submission_prefix is not None:
             mmengine.mkdir_or_exist(submission_prefix)
@@ -444,7 +477,7 @@ class WaymoMetric(KittiMetric):
             sample_idx = sample_id_list[idx]
             info = self.data_infos[sample_idx]
-            if self.task == 'mono_det':
+            if self.load_type == 'mv_image_based':
                 if idx % self.num_cams == 0:
                     box_dict_per_frame = []
                     cam0_key = list(info['images'].keys())[0]
@@ -461,7 +494,7 @@ class WaymoMetric(KittiMetric):
             # If you want to use another camera, please modify it.
             image_shape = (info['images'][self.default_cam_key]['height'],
                            info['images'][self.default_cam_key]['width'])
-            if self.task == 'mono3d':
+            if self.load_type == 'mv_image_based':
                 box_dict_per_frame.append(box_dict)
                 if (idx + 1) % self.num_cams != 0:
                     continue
@@ -544,7 +577,7 @@ class WaymoMetric(KittiMetric):
             # In waymo validation sample_idx in prediction is 000xxx
             # but in info file it is 1000xxx
             save_sample_idx = box_dict['sample_idx']
-            annos[-1]['sample_id'] = np.array(
+            annos[-1]['sample_idx'] = np.array(
                 [save_sample_idx] * len(annos[-1]['score']), dtype=np.int64)
             det_annos += annos
@@ -561,12 +594,12 @@ class WaymoMetric(KittiMetric):
     def convert_valid_bboxes(self, box_dict: dict, info: dict):
         """Convert the predicted boxes into valid ones. Should handle the
-        different task mode (mono3d, mv3d, lidar), separately.
+        different load_type (frame_based, mv_image_based, fov_image_based),
+        separately.

         Args:
             box_dict (dict): Box dictionaries to be converted.

-                - boxes_3d (:obj:`LiDARInstance3DBoxes`): 3D bounding boxes.
+                - bboxes_3d (:obj:`LiDARInstance3DBoxes`): 3D bounding boxes.
                 - scores_3d (torch.Tensor): Scores of boxes.
                 - labels_3d (torch.Tensor): Class labels of boxes.

             info (dict): Data info.
@@ -587,7 +620,7 @@ class WaymoMetric(KittiMetric):
         box_preds = box_dict['bboxes_3d']
         scores = box_dict['scores_3d']
         labels = box_dict['labels_3d']
-        sample_idx = info['sample_id']
+        sample_idx = info['sample_idx']
         box_preds.limit_yaw(offset=0.5, period=np.pi * 2)
         if len(box_preds) == 0:
@@ -598,11 +631,11 @@ class WaymoMetric(KittiMetric):
                 scores=np.zeros([0]),
                 label_preds=np.zeros([0, 4]),
                 sample_idx=sample_idx)
-        # Here default used 'CAM2' to compute metric. If you want to
+        # Here default used 'CAM_FRONT' to compute metric. If you want to
         # use another camera, please modify it.
-        if self.task in ['mv3d', 'lidar']:
+        if self.load_type in ['frame_based', 'fov_image_based']:
             cam_key = self.default_cam_key
-        elif self.task == 'mono3d':
+        elif self.load_type == 'mv_image_based':
             cam_key = list(info['images'].keys())[0]
         else:
             raise NotImplementedError
@@ -635,12 +668,12 @@ class WaymoMetric(KittiMetric):
             (box_2d_preds[:, 1] < image_shape[0]) &
             (box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0))
         # check box_preds_lidar
-        if self.task in ['lidar', 'mono3d']:
+        if self.load_type in ['frame_based']:
             limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range)
             valid_pcd_inds = ((box_preds_lidar.center > limit_range[:3]) &
                               (box_preds_lidar.center < limit_range[3:]))
             valid_inds = valid_pcd_inds.all(-1)
-        elif self.task == 'mono3d':
+        if self.load_type in ['mv_image_based', 'fov_image_based']:
             valid_inds = valid_cam_inds
         if valid_inds.sum() > 0:
...
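For orientation, a hedged sketch combining the reworked WaymoMetric arguments (argument names come from the diff; paths and values are placeholders, not from the commit):

# Illustrative evaluator config; every path below is a placeholder.
val_evaluator = dict(
    type='WaymoMetric',
    ann_file='data/waymo/waymo_infos_val.pkl',
    waymo_bin_file='data/waymo/waymo_gt.bin',
    data_root='data/waymo/waymo_format',
    metric='mAP',
    load_type='frame_based',     # or 'mv_image_based' / 'fov_image_based'
    convert_kitti_format=False,  # True keeps the KITTI intermediate step
    idx2metainfo=None)           # optional path to a sample_idx -> metainfo pkl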
 # Copyright (c) OpenMMLab. All rights reserved.
 from mmdet.models.backbones import SSDVGG, HRNet, ResNet, ResNetV1d, ResNeXt

 from .dgcnn import DGCNNBackbone
 from .dla import DLANet
 from .mink_resnet import MinkResNet
...
@@ -5,28 +5,25 @@ try:
     import MinkowskiEngine as ME
     from MinkowskiEngine.modules.resnet_block import BasicBlock, Bottleneck
 except ImportError:
-    import warnings
-    warnings.warn(
-        'Please follow `getting_started.md` to install MinkowskiEngine.`')
     # blocks are used in the static part of MinkResNet
-    BasicBlock, Bottleneck = None, None
+    ME = BasicBlock = Bottleneck = None

 import torch.nn as nn

-from mmdet3d.models.builder import BACKBONES
+from mmdet3d.registry import MODELS

-@BACKBONES.register_module()
+@MODELS.register_module()
 class MinkResNet(nn.Module):
     r"""Minkowski ResNet backbone. See `4D Spatio-Temporal ConvNets
     <https://arxiv.org/abs/1904.08755>`_ for more details.

     Args:
         depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
-        in_channels (ont): Number of input channels, 3 for RGB.
-        num_stages (int, optional): Resnet stages. Default: 4.
-        pool (bool, optional): Add max pooling after first conv if True.
-            Default: True.
+        in_channels (int): Number of input channels, 3 for RGB.
+        num_stages (int): Resnet stages. Defaults to 4.
+        pool (bool): Whether to add max pooling after first conv.
+            Defaults to True.
     """
     arch_settings = {
         18: (BasicBlock, (2, 2, 2, 2)),
@@ -38,6 +35,10 @@ class MinkResNet(nn.Module):

     def __init__(self, depth, in_channels, num_stages=4, pool=True):
         super(MinkResNet, self).__init__()
+        if ME is None:
+            raise ImportError(
+                'Please follow `getting_started.md` to install MinkowskiEngine.'  # noqa: E501
+            )
         if depth not in self.arch_settings:
             raise KeyError(f'invalid depth {depth} for resnet')
         assert 4 >= num_stages >= 1
@@ -58,7 +59,7 @@ class MinkResNet(nn.Module):
         for i, num_blocks in enumerate(stage_blocks):
             setattr(
-                self, f'layer{i}',
+                self, f'layer{i + 1}',
                 self._make_layer(block, 64 * 2**i, stage_blocks[i], stride=2))

     def init_weights(self):
@@ -111,6 +112,6 @@ class MinkResNet(nn.Module):
             x = self.maxpool(x)
         outs = []
         for i in range(self.num_stages):
-            x = getattr(self, f'layer{i}')(x)
+            x = getattr(self, f'layer{i + 1}')(x)
             outs.append(x)
         return outs
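The MinkowskiEngine change above swaps an import-time warning for a deferred ImportError; a self-contained sketch of the same pattern with generic names (not from the repo):

# The module stays importable without the optional dependency; only
# instantiating the class that needs it raises.
try:
    import MinkowskiEngine as ME
except ImportError:
    ME = None

class SparseBackbone:  # hypothetical class
    def __init__(self):
        if ME is None:
            raise ImportError('Please install MinkowskiEngine first.')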
 # Copyright (c) OpenMMLab. All rights reserved.
-from mmdet3d.registry import MODELS
 from mmdet.models.backbones import RegNet
+from mmdet3d.registry import MODELS

 @MODELS.register_module()
 class NoStemRegNet(RegNet):
...
 # Copyright (c) OpenMMLab. All rights reserved.
 import math
 from numbers import Number
-from typing import Dict, List, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Union

 import numpy as np
 import torch
 from mmcv.ops import Voxelization
+from mmdet.models import DetDataPreprocessor
 from mmengine.model import stack_batch
 from mmengine.utils import is_list_of
 from torch.nn import functional as F

 from mmdet3d.registry import MODELS
 from mmdet3d.utils import OptConfigType
-from mmdet.models import DetDataPreprocessor
 from .utils import multiview_img_stack_batch
@@ -28,24 +28,25 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
     - 1) For image data:

       - Pad images in inputs to the maximum size of current batch with defined
         ``pad_value``. The padding size can be divisible by a defined
-        ``pad_size_divisor``
+        ``pad_size_divisor``.
       - Stack images in inputs to batch_imgs.
       - Convert images in inputs from bgr to rgb if the shape of input is
         (3, H, W).
       - Normalize images in inputs with defined std and mean.
       - Do batch augmentations during training.

     - 2) For point cloud data:

-      - if no voxelization, directly return list of point cloud data.
-      - if voxelization is applied, voxelize point cloud according to
+      - If no voxelization, directly return list of point cloud data.
+      - If voxelization is applied, voxelize point cloud according to
         ``voxel_type`` and obtain ``voxels``.

     Args:
-        voxel (bool): Whether to apply voxelziation to point cloud.
+        voxel (bool): Whether to apply voxelization to point cloud.
+            Defaults to False.
         voxel_type (str): Voxelization type. Two voxelization types are
             provided: 'hard' and 'dynamic', respectively for hard
             voxelization and dynamic voxelization. Defaults to 'hard'.
-        voxel_layer (:obj:`ConfigDict`, optional): Voxelization layer
+        voxel_layer (dict or :obj:`ConfigDict`, optional): Voxelization layer
             config. Defaults to None.
         mean (Sequence[Number], optional): The pixel mean of R, G, B channels.
             Defaults to None.
@@ -54,11 +55,21 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
         pad_size_divisor (int): The size of padded image should be
             divisible by ``pad_size_divisor``. Defaults to 1.
         pad_value (Number): The padded pixel value. Defaults to 0.
-        bgr_to_rgb (bool): whether to convert image from BGR to RGB.
+        pad_mask (bool): Whether to pad instance masks. Defaults to False.
+        mask_pad_value (int): The padded pixel value for instance masks.
+            Defaults to 0.
+        pad_seg (bool): Whether to pad semantic segmentation maps.
+            Defaults to False.
+        seg_pad_value (int): The padded pixel value for semantic
+            segmentation maps. Defaults to 255.
+        bgr_to_rgb (bool): Whether to convert image from BGR to RGB.
             Defaults to False.
-        rgb_to_bgr (bool): whether to convert image from RGB to RGB.
+        rgb_to_bgr (bool): Whether to convert image from RGB to BGR.
             Defaults to False.
-        batch_augments (list[dict], optional): Batch-level augmentations
+        boxtype2tensor (bool): Whether to keep the ``BaseBoxes`` type of
+            bboxes data or not. Defaults to True.
+        batch_augments (List[dict], optional): Batch-level augmentations.
+            Defaults to None.
     """

     def __init__(self,
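A hedged configuration sketch exercising the documented arguments (names from the docstring above; the voxel-layer numbers are illustrative KITTI-style values, not part of the diff):

data_preprocessor = dict(
    type='Det3DDataPreprocessor',
    voxel=True,
    voxel_type='hard',
    voxel_layer=dict(  # illustrative values
        max_num_points=32,
        point_cloud_range=[0, -40, -3, 70.4, 40, 1],
        voxel_size=[0.05, 0.05, 0.1],
        max_voxels=(16000, 40000)),
    mean=[123.675, 116.28, 103.53],  # illustrative normalization stats
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_size_divisor=32)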
@@ -76,8 +87,8 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
                  bgr_to_rgb: bool = False,
                  rgb_to_bgr: bool = False,
                  boxtype2tensor: bool = True,
-                 batch_augments: Optional[List[dict]] = None):
-        super().__init__(
+                 batch_augments: Optional[List[dict]] = None) -> None:
+        super(Det3DDataPreprocessor, self).__init__(
             mean=mean,
             std=std,
             pad_size_divisor=pad_size_divisor,
@@ -94,24 +105,21 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
         if voxel:
             self.voxel_layer = Voxelization(**voxel_layer)

-    def forward(
-        self,
-        data: Union[dict, List[dict]],
-        training: bool = False
-    ) -> Tuple[Union[dict, List[dict]], Optional[list]]:
-        """Perform normalization、padding and bgr2rgb conversion based on
+    def forward(self,
+                data: Union[dict, List[dict]],
+                training: bool = False) -> Union[dict, List[dict]]:
+        """Perform normalization, padding and bgr2rgb conversion based on
         ``BaseDataPreprocessor``.

         Args:
-            data (dict | List[dict]): data from dataloader.
+            data (dict or List[dict]): Data from dataloader.
                 The dict contains the whole batch data, when it is
                 a list[dict], the list indicates test time augmentation.
             training (bool): Whether to enable training time augmentation.
                 Defaults to False.

         Returns:
-            Dict | List[Dict]: Data in the same format as the model input.
+            dict or List[dict]: Data in the same format as the model input.
         """
         if isinstance(data, list):
             num_augs = len(data)
@@ -126,7 +134,7 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
         return self.simple_process(data, training)

     def simple_process(self, data: dict, training: bool = False) -> dict:
-        """Perform normalization、padding and bgr2rgb conversion for img data
+        """Perform normalization, padding and bgr2rgb conversion for img data
         based on ``BaseDataPreprocessor``, and voxelize point cloud if `voxel`
         is set to be True.
@@ -188,7 +196,7 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
         return {'inputs': batch_inputs, 'data_samples': data_samples}

-    def preprocess_img(self, _batch_img):
+    def preprocess_img(self, _batch_img: torch.Tensor) -> torch.Tensor:
         # channel transform
         if self._channel_conversion:
             _batch_img = _batch_img[[2, 1, 0], ...]
@@ -206,7 +214,7 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
         return _batch_img

     def collate_data(self, data: dict) -> dict:
-        """Copying data to the target device and Performs normalization
+        """Copying data to the target device and Performs normalization,
         padding and bgr2rgb conversion and stack based on
         ``BaseDataPreprocessor``.
@@ -273,7 +281,7 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
                 raise TypeError(
                     'Output of `cast_data` should be a list of dict '
                     'or a tuple with inputs and data_samples, but got'
-                    f'{type(data)} {data}')
+                    f'{type(data)}: {data}')

         data['inputs']['imgs'] = batch_imgs
@@ -284,14 +292,14 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
     def _get_pad_shape(self, data: dict) -> List[tuple]:
         """Get the pad_shape of each image based on data and
         pad_size_divisor."""
-        # rewrite `_get_pad_shape` for obaining image inputs.
+        # rewrite `_get_pad_shape` for obtaining image inputs.
         _batch_inputs = data['inputs']['img']
         # Process data with `pseudo_collate`.
         if is_list_of(_batch_inputs, torch.Tensor):
             batch_pad_shape = []
             for ori_input in _batch_inputs:
                 if ori_input.dim() == 4:
-                    # mean multiivew input, select ont of the
+                    # mean multiview input, select one of the
                     # image to calculate the pad shape
                     ori_input = ori_input[0]
                 pad_h = int(
@@ -316,24 +324,24 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
             batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0]
         else:
             raise TypeError('Output of `cast_data` should be a list of dict '
-                            'or a tuple with inputs and data_samples, but got'
+                            'or a tuple with inputs and data_samples, but got '
                             f'{type(data)}: {data}')
         return batch_pad_shape

     @torch.no_grad()
-    def voxelize(self, points: List[torch.Tensor]) -> Dict:
+    def voxelize(self, points: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
         """Apply voxelization to point cloud.

         Args:
             points (List[Tensor]): Point cloud in one data batch.

         Returns:
-            dict[str, Tensor]: Voxelization information.
+            Dict[str, Tensor]: Voxelization information.

-            - voxels (Tensor): Features of voxels, shape is MXNxC for hard
-              voxelization, NXC for dynamic voxelization.
+            - voxels (Tensor): Features of voxels, shape is MxNxC for hard
+              voxelization, NxC for dynamic voxelization.
             - coors (Tensor): Coordinates of voxels, shape is Nx(1+NDim),
               where 1 represents the batch index.
             - num_points (Tensor, optional): Number of points in each voxel.
             - voxel_centers (Tensor, optional): Centers of voxels.
         """
@@ -342,43 +350,38 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
         if self.voxel_type == 'hard':
             voxels, coors, num_points, voxel_centers = [], [], [], []
-            for res in points:
+            for i, res in enumerate(points):
                 res_voxels, res_coors, res_num_points = self.voxel_layer(res)
                 res_voxel_centers = (
                     res_coors[:, [2, 1, 0]] + 0.5) * res_voxels.new_tensor(
                         self.voxel_layer.voxel_size) + res_voxels.new_tensor(
                             self.voxel_layer.point_cloud_range[0:3])
+                res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
                 voxels.append(res_voxels)
                 coors.append(res_coors)
                 num_points.append(res_num_points)
                 voxel_centers.append(res_voxel_centers)

             voxels = torch.cat(voxels, dim=0)
+            coors = torch.cat(coors, dim=0)
             num_points = torch.cat(num_points, dim=0)
             voxel_centers = torch.cat(voxel_centers, dim=0)
-            coors_batch = []
-            for i, coor in enumerate(coors):
-                coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
-                coors_batch.append(coor_pad)
-            coors_batch = torch.cat(coors_batch, dim=0)

             voxel_dict['num_points'] = num_points
             voxel_dict['voxel_centers'] = voxel_centers
         elif self.voxel_type == 'dynamic':
             coors = []
             # dynamic voxelization only provide a coors mapping
-            for res in points:
+            for i, res in enumerate(points):
                 res_coors = self.voxel_layer(res)
+                res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
                 coors.append(res_coors)
             voxels = torch.cat(points, dim=0)
-            coors_batch = []
-            for i, coor in enumerate(coors):
-                coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
-                coors_batch.append(coor_pad)
-            coors_batch = torch.cat(coors_batch, dim=0)
+            coors = torch.cat(coors, dim=0)
         else:
             raise ValueError(f'Invalid voxelization type {self.voxel_type}')

         voxel_dict['voxels'] = voxels
-        voxel_dict['coors'] = coors_batch
+        voxel_dict['coors'] = coors
         return voxel_dict
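The refactor above folds the batch-index padding into the main loop instead of a second pass; a standalone sketch of the pattern (shapes illustrative):

import torch
import torch.nn.functional as F

# Two samples with 5 and 7 voxel coordinates (z, y, x) each.
per_sample_coors = [torch.zeros(5, 3, dtype=torch.int32),
                    torch.ones(7, 3, dtype=torch.int32)]
coors = []
for i, res_coors in enumerate(per_sample_coors):
    # (1, 0) pads one column on the left of the last dim with the batch index
    coors.append(F.pad(res_coors, (1, 0), mode='constant', value=i))
coors = torch.cat(coors, dim=0)  # shape (12, 4); column 0 is the batch index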
@@ -12,7 +12,7 @@ def multiview_img_stack_batch(
     """
     Compared to the stack_batch in mmengine.model.utils,
     multiview_img_stack_batch further handles the multiview images.
-    see diff of padded_sizes[:, :-2] = 0 vs padded_sizees[:, 0] = 0 in line 47
+    see diff of padded_sizes[:, :-2] = 0 vs padded_sizes[:, 0] = 0 in line 47

     Stack multiple tensors to form a batch and pad the tensor to the max
     shape using the right bottom padding mode in these images. If
     ``pad_size_divisor > 0``, add padding to ensure the shape of each dim is
@@ -23,20 +23,20 @@ def multiview_img_stack_batch(
         pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding
             to ensure the shape of each dim is divisible by
             ``pad_size_divisor``. This depends on the model, and many
-            models need to be divisible by 32. Defaults to 1
-        pad_value (int, float): The padding value. Defaults to 0.
+            models need to be divisible by 32. Defaults to 1.
+        pad_value (int or float): The padding value. Defaults to 0.

     Returns:
         Tensor: The n dim tensor.
     """
     assert isinstance(
         tensor_list,
-        list), (f'Expected input type to be list, but got {type(tensor_list)}')
+        list), f'Expected input type to be list, but got {type(tensor_list)}'
     assert tensor_list, '`tensor_list` could not be an empty list'
     assert len({
         tensor.ndim
         for tensor in tensor_list
-    }) == 1, (f'Expected the dimensions of all tensors must be the same, '
+    }) == 1, ('Expected the dimensions of all tensors must be the same, '
               f'but got {[tensor.ndim for tensor in tensor_list]}')

     dim = tensor_list[0].dim()
@@ -46,7 +46,7 @@ def multiview_img_stack_batch(
     max_sizes = torch.ceil(
         torch.max(all_sizes, dim=0)[0] / pad_size_divisor) * pad_size_divisor
     padded_sizes = max_sizes - all_sizes
+    # The first dim normally means channel, which should not be padded.
     padded_sizes[:, :-2] = 0
     if padded_sizes.sum() == 0:
         return torch.stack(tensor_list)
...
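An illustrative call of the utility documented above (import path and shapes assumed, not taken from the diff):

import torch
# assumed module path of the helper
from mmdet3d.models.data_preprocessors.utils import multiview_img_stack_batch

# two samples, each with 6 camera views of slightly different sizes
imgs = [torch.rand(6, 3, 370, 1220), torch.rand(6, 3, 376, 1240)]
batch = multiview_img_stack_batch(imgs, pad_size_divisor=32)
# only H and W get right-bottom padding (to 384 x 1248)
assert batch.shape == (2, 6, 3, 384, 1248)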
@@ -41,19 +41,20 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
     Args:
         channels (int): Channels after modules, before conv_seg.
         num_classes (int): Number of classes.
-        dropout_ratio (float, optional): Ratio of dropout layer. Default: 0.5.
-        conv_cfg (dict, optional): Config of conv layers.
-            Default: dict(type='Conv1d').
-        norm_cfg (dict, optional): Config of norm layers.
-            Default: dict(type='BN1d').
-        act_cfg (dict, optional): Config of activation layers.
-            Default: dict(type='ReLU').
-        loss_decode (dict, optional): Config of decode loss.
-            Default: dict(type='CrossEntropyLoss').
-        ignore_index (int, optional): The label index to be ignored.
+        dropout_ratio (float): Ratio of dropout layer. Defaults to 0.5.
+        conv_cfg (dict): Config of conv layers.
+            Defaults to dict(type='Conv1d').
+        norm_cfg (dict): Config of norm layers.
+            Defaults to dict(type='BN1d').
+        act_cfg (dict): Config of activation layers.
+            Defaults to dict(type='ReLU').
+        loss_decode (dict): Config of decode loss.
+            Defaults to dict(type='CrossEntropyLoss').
+        ignore_index (int): The label index to be ignored.
             When using masked BCE loss, ignore_index should be set to None.
-            Default: 255.
+            Defaults to 255.
         init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
     """

     def __init__(self,
@@ -86,8 +87,6 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
         else:
             self.dropout = None

-        self.fp16_enabled = False
-
     def init_weights(self):
         """Initialize weights of classification layer."""
         super().init_weights()
@@ -105,15 +104,15 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
         output = self.conv_seg(feat)
         return output

-    def loss(self, inputs: List[Tensor],
-             batch_data_samples: SampleList) -> dict:
+    def loss(self, inputs: List[Tensor], batch_data_samples: SampleList,
+             train_cfg: ConfigType) -> dict:
         """Forward function for training.

         Args:
             inputs (list[torch.Tensor]): List of multi-level point features.
-            img_metas (list[dict]): Meta information of each sample.
-            pts_semantic_mask (torch.Tensor): Semantic segmentation masks
-                used if the architecture supports semantic segmentation task.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg
+                data samples. It usually includes information such
+                as `metainfo` and `gt_pts_seg`.
             train_cfg (dict): The training config.

         Returns:
@@ -129,7 +128,9 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
         Args:
             inputs (list[Tensor]): List of multi-level point features.
-            batch_img_metas (list[dict]): Meta information of each sample.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg
+                data samples. It usually includes information such
+                as `metainfo` and `gt_pts_seg`.
             test_cfg (dict): The testing config.

         Returns:
...
...@@ -5,6 +5,7 @@ from .base_3d_dense_head import Base3DDenseHead ...@@ -5,6 +5,7 @@ from .base_3d_dense_head import Base3DDenseHead
from .base_conv_bbox_head import BaseConvBboxHead from .base_conv_bbox_head import BaseConvBboxHead
from .base_mono3d_dense_head import BaseMono3DDenseHead from .base_mono3d_dense_head import BaseMono3DDenseHead
from .centerpoint_head import CenterHead from .centerpoint_head import CenterHead
from .fcaf3d_head import FCAF3DHead
from .fcos_mono3d_head import FCOSMono3DHead from .fcos_mono3d_head import FCOSMono3DHead
from .free_anchor3d_head import FreeAnchor3DHead from .free_anchor3d_head import FreeAnchor3DHead
from .groupfree3d_head import GroupFree3DHead from .groupfree3d_head import GroupFree3DHead
...@@ -22,5 +23,5 @@ __all__ = [ ...@@ -22,5 +23,5 @@ __all__ = [
'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead', 'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead', 'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead',
'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead', 'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead',
'MonoFlexHead', 'Base3DDenseHead' 'MonoFlexHead', 'Base3DDenseHead', 'FCAF3DHead'
] ]
...@@ -4,6 +4,7 @@ from typing import List, Tuple ...@@ -4,6 +4,7 @@ from typing import List, Tuple
import numpy as np import numpy as np
import torch import torch
from mmdet.models.utils import multi_apply
from torch import Tensor from torch import Tensor
from torch import nn as nn from torch import nn as nn
...@@ -12,7 +13,6 @@ from mmdet3d.models.test_time_augs import merge_aug_bboxes_3d ...@@ -12,7 +13,6 @@ from mmdet3d.models.test_time_augs import merge_aug_bboxes_3d
from mmdet3d.registry import MODELS, TASK_UTILS from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.utils.typing import (ConfigType, InstanceList, OptConfigType, from mmdet3d.utils.typing import (ConfigType, InstanceList, OptConfigType,
OptInstanceList) OptInstanceList)
from mmdet.models.utils import multi_apply
from .base_3d_dense_head import Base3DDenseHead from .base_3d_dense_head import Base3DDenseHead
from .train_mixins import AnchorTrainMixin from .train_mixins import AnchorTrainMixin
......
...@@ -4,13 +4,13 @@ from typing import Any, List, Sequence, Tuple, Union ...@@ -4,13 +4,13 @@ from typing import Any, List, Sequence, Tuple, Union
import torch import torch
from mmcv.cnn import ConvModule from mmcv.cnn import ConvModule
from mmdet.models.utils import multi_apply
from mmengine.model import bias_init_with_prob, normal_init from mmengine.model import bias_init_with_prob, normal_init
from torch import Tensor from torch import Tensor
from torch import nn as nn from torch import nn as nn
from mmdet3d.registry import MODELS from mmdet3d.registry import MODELS
from mmdet3d.utils import ConfigType, InstanceList, OptConfigType from mmdet3d.utils import ConfigType, InstanceList, OptConfigType
from mmdet.models.utils import multi_apply
from .base_mono3d_dense_head import BaseMono3DDenseHead from .base_mono3d_dense_head import BaseMono3DDenseHead
......
...@@ -4,6 +4,7 @@ from typing import List, Optional, Tuple ...@@ -4,6 +4,7 @@ from typing import List, Optional, Tuple
import numpy as np import numpy as np
import torch import torch
from mmdet.models.utils import select_single_mlvl
from mmengine.config import ConfigDict from mmengine.config import ConfigDict
from mmengine.model import BaseModule, constant_init from mmengine.model import BaseModule, constant_init
from mmengine.structures import InstanceData from mmengine.structures import InstanceData
...@@ -13,7 +14,6 @@ from mmdet3d.models.layers import box3d_multiclass_nms ...@@ -13,7 +14,6 @@ from mmdet3d.models.layers import box3d_multiclass_nms
from mmdet3d.structures import limit_period, xywhr2xyxyr from mmdet3d.structures import limit_period, xywhr2xyxyr
from mmdet3d.structures.det3d_data_sample import SampleList from mmdet3d.structures.det3d_data_sample import SampleList
from mmdet3d.utils.typing import InstanceList, OptMultiConfig from mmdet3d.utils.typing import InstanceList, OptMultiConfig
from mmdet.models.utils import select_single_mlvl
class Base3DDenseHead(BaseModule, metaclass=ABCMeta): class Base3DDenseHead(BaseModule, metaclass=ABCMeta):
......
...@@ -4,6 +4,7 @@ from typing import Dict, List, Optional, Tuple, Union ...@@ -4,6 +4,7 @@ from typing import Dict, List, Optional, Tuple, Union
import torch import torch
from mmcv.cnn import ConvModule, build_conv_layer from mmcv.cnn import ConvModule, build_conv_layer
from mmdet.models.utils import multi_apply
from mmengine.model import BaseModule from mmengine.model import BaseModule
from mmengine.structures import InstanceData from mmengine.structures import InstanceData
from torch import Tensor, nn from torch import Tensor, nn
...@@ -12,7 +13,6 @@ from mmdet3d.models.utils import (clip_sigmoid, draw_heatmap_gaussian, ...@@ -12,7 +13,6 @@ from mmdet3d.models.utils import (clip_sigmoid, draw_heatmap_gaussian,
gaussian_radius) gaussian_radius)
from mmdet3d.registry import MODELS, TASK_UTILS from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures import Det3DDataSample, xywhr2xyxyr from mmdet3d.structures import Det3DDataSample, xywhr2xyxyr
from mmdet.models.utils import multi_apply
from .. import builder from .. import builder
from ..layers import circle_nms, nms_bev from ..layers import circle_nms, nms_bev
......
# Copyright (c) OpenMMLab. All rights reserved.
# Adapted from https://github.com/SamsungLabs/fcaf3d/blob/master/mmdet3d/models/dense_heads/fcaf3d_neck_with_head.py # noqa
from typing import List, Optional, Tuple
try:
import MinkowskiEngine as ME
from MinkowskiEngine import SparseTensor
except ImportError:
# Please follow getting_started.md to install MinkowskiEngine.
ME = SparseTensor = None
import torch
from mmcv.cnn import Scale
from mmcv.ops import nms3d, nms3d_normal
from mmdet.utils import reduce_mean
from mmengine.model import bias_init_with_prob
from mmengine.structures import InstanceData
from torch import Tensor, nn
from mmdet3d.registry import MODELS
from mmdet3d.structures import BaseInstance3DBoxes, rotation_3d_in_axis
from mmdet3d.utils import InstanceList, OptInstanceList
from .base_3d_dense_head import Base3DDenseHead
@MODELS.register_module()
class FCAF3DHead(Base3DDenseHead):
r"""Bbox head of `FCAF3D <https://arxiv.org/abs/2112.00322>`_.
Here we store both the sparse 3D FPN (neck) and the head, as they can
not be simply separated: the pruning score on the i-th level of the
FPN requires classification scores from the (i+1)-th level of the head.
Args:
num_classes (int): Number of classes.
in_channels (int): Number of channels in input tensors.
out_channels (int): Number of channels in the neck output tensors.
num_reg_outs (int): Number of regression layer channels.
voxel_size (float): Voxel size in meters.
pts_prune_threshold (int): Pruning threshold on each feature level.
pts_assign_threshold (int): Box-to-location assigner parameter. The
assigner selects the deepest feature level that still has at least
pts_assign_threshold locations inside the box.
pts_center_threshold (int): Box-to-location assigner parameter. After
the feature level for a box is determined, the assigner keeps the
pts_center_threshold locations closest to the box center.
center_loss (dict): Config of centerness loss. Defaults to
dict(type='mmdet.CrossEntropyLoss', use_sigmoid=True).
bbox_loss (dict): Config of bbox loss. Defaults to
dict(type='AxisAlignedIoULoss').
cls_loss (dict): Config of classification loss. Defaults to
dict(type='mmdet.FocalLoss').
train_cfg (dict, optional): Config for train stage. Defaults to None.
test_cfg (dict, optional): Config for test stage. Defaults to None.
init_cfg (dict, optional): Config for weight initialization.
Defaults to None.
"""
def __init__(self,
num_classes: int,
in_channels: int,
out_channels: int,
num_reg_outs: int,
voxel_size: float,
pts_prune_threshold: int,
pts_assign_threshold: int,
pts_center_threshold: int,
center_loss: dict = dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=True),
bbox_loss: dict = dict(type='AxisAlignedIoULoss'),
cls_loss: dict = dict(type='mmdet.FocalLoss'),
train_cfg: Optional[dict] = None,
test_cfg: Optional[dict] = None,
init_cfg: Optional[dict] = None):
super(FCAF3DHead, self).__init__(init_cfg)
if ME is None:
raise ImportError(
'Please follow `getting_started.md` to install MinkowskiEngine.' # noqa: E501
)
self.voxel_size = voxel_size
self.pts_prune_threshold = pts_prune_threshold
self.pts_assign_threshold = pts_assign_threshold
self.pts_center_threshold = pts_center_threshold
self.center_loss = MODELS.build(center_loss)
self.bbox_loss = MODELS.build(bbox_loss)
self.cls_loss = MODELS.build(cls_loss)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self._init_layers(in_channels, out_channels, num_reg_outs, num_classes)
@staticmethod
def _make_block(in_channels: int, out_channels: int) -> nn.Module:
"""Construct Conv-Norm-Act block.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
Returns:
torch.nn.Module: The block with the corresponding layers.
"""
return nn.Sequential(
ME.MinkowskiConvolution(
in_channels, out_channels, kernel_size=3, dimension=3),
ME.MinkowskiBatchNorm(out_channels), ME.MinkowskiELU())
@staticmethod
def _make_up_block(in_channels: int, out_channels: int) -> nn.Module:
"""Construct DeConv-Norm-Act-Conv-Norm-Act block.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
Returns:
torch.nn.Module: The block with the corresponding layers.
"""
return nn.Sequential(
ME.MinkowskiGenerativeConvolutionTranspose(
in_channels,
out_channels,
kernel_size=2,
stride=2,
dimension=3), ME.MinkowskiBatchNorm(out_channels),
ME.MinkowskiELU(),
ME.MinkowskiConvolution(
out_channels, out_channels, kernel_size=3, dimension=3),
ME.MinkowskiBatchNorm(out_channels), ME.MinkowskiELU())
def _init_layers(self, in_channels: Tuple[int], out_channels: int,
num_reg_outs: int, num_classes: int):
"""Initialize layers.
Args:
in_channels (tuple[int]): Number of channels in input tensors.
out_channels (int): Number of channels in the neck output tensors.
num_reg_outs (int): Number of regression layer channels.
num_classes (int): Number of classes.
"""
# neck layers
self.pruning = ME.MinkowskiPruning()
for i in range(len(in_channels)):
if i > 0:
self.__setattr__(
f'up_block_{i}',
self._make_up_block(in_channels[i], in_channels[i - 1]))
self.__setattr__(f'out_block_{i}',
self._make_block(in_channels[i], out_channels))
# head layers
self.conv_center = ME.MinkowskiConvolution(
out_channels, 1, kernel_size=1, dimension=3)
self.conv_reg = ME.MinkowskiConvolution(
out_channels, num_reg_outs, kernel_size=1, dimension=3)
self.conv_cls = ME.MinkowskiConvolution(
out_channels, num_classes, kernel_size=1, bias=True, dimension=3)
self.scales = nn.ModuleList(
[Scale(1.) for _ in range(len(in_channels))])
def init_weights(self):
"""Initialize weights."""
nn.init.normal_(self.conv_center.kernel, std=.01)
nn.init.normal_(self.conv_reg.kernel, std=.01)
nn.init.normal_(self.conv_cls.kernel, std=.01)
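# RetinaNet-style prior: initialize the classification bias so the
# initial foreground probability is about 0.01, which stabilizes the
# focal loss at the start of training.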
nn.init.constant_(self.conv_cls.bias, bias_init_with_prob(.01))
def forward(self, x: List[Tensor]) -> Tuple[List[Tensor], ...]:
"""Forward pass.
Args:
x (list[Tensor]): Features from the backbone.
Returns:
Tuple[List[Tensor], ...]: Predictions of the head.
"""
center_preds, bbox_preds, cls_preds, points = [], [], [], []
inputs = x
x = inputs[-1]
prune_score = None
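# Traverse levels from the deepest one upwards: upsample the previous
# output, fuse it with the lateral (skip) input, then prune locations
# using the pruning scores produced at the deeper level.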
for i in range(len(inputs) - 1, -1, -1):
if i < len(inputs) - 1:
x = self.__getattr__(f'up_block_{i + 1}')(x)
x = inputs[i] + x
x = self._prune(x, prune_score)
out = self.__getattr__(f'out_block_{i}')(x)
center_pred, bbox_pred, cls_pred, point, prune_score = \
self._forward_single(out, self.scales[i])
center_preds.append(center_pred)
bbox_preds.append(bbox_pred)
cls_preds.append(cls_pred)
points.append(point)
return center_preds[::-1], bbox_preds[::-1], cls_preds[::-1], \
points[::-1]
def _prune(self, x: SparseTensor, scores: SparseTensor) -> SparseTensor:
"""Prunes the tensor by score thresholding.
Args:
x (SparseTensor): Tensor to be pruned.
scores (SparseTensor): Scores for thresholding.
Returns:
SparseTensor: Pruned tensor.
"""
with torch.no_grad():
coordinates = x.C.float()
interpolated_scores = scores.features_at_coordinates(coordinates)
prune_mask = interpolated_scores.new_zeros(
(len(interpolated_scores)), dtype=torch.bool)
for permutation in x.decomposition_permutations:
score = interpolated_scores[permutation]
mask = score.new_zeros((len(score)), dtype=torch.bool)
topk = min(len(score), self.pts_prune_threshold)
ids = torch.topk(score.squeeze(1), topk, sorted=False).indices
mask[ids] = True
prune_mask[permutation[mask]] = True
x = self.pruning(x, prune_mask)
return x
def _forward_single(self, x: SparseTensor,
scale: Scale) -> Tuple[Tensor, ...]:
"""Forward pass per level.
Args:
x (SparseTensor): Per level neck output tensor.
scale (mmcv.cnn.Scale): Per level multiplication weight.
Returns:
tuple[Tensor]: Per level head predictions.
"""
center_pred = self.conv_center(x).features
scores = self.conv_cls(x)
cls_pred = scores.features
prune_scores = ME.SparseTensor(
scores.features.max(dim=1, keepdim=True).values,
coordinate_map_key=scores.coordinate_map_key,
coordinate_manager=scores.coordinate_manager)
reg_final = self.conv_reg(x).features
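# The first 6 regression channels are face distances; exponentiating
# the scaled values keeps them strictly positive. Remaining channels
# (if any) encode the heading angle.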
reg_distance = torch.exp(scale(reg_final[:, :6]))
reg_angle = reg_final[:, 6:]
bbox_pred = torch.cat((reg_distance, reg_angle), dim=1)
center_preds, bbox_preds, cls_preds, points = [], [], [], []
for permutation in x.decomposition_permutations:
center_preds.append(center_pred[permutation])
bbox_preds.append(bbox_pred[permutation])
cls_preds.append(cls_pred[permutation])
points = x.decomposed_coordinates
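# Sparse coordinates are in voxel units; scale them to meters.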
for i in range(len(points)):
points[i] = points[i] * self.voxel_size
return center_preds, bbox_preds, cls_preds, points, prune_scores
def _loss_by_feat_single(self, center_preds: List[Tensor],
bbox_preds: List[Tensor], cls_preds: List[Tensor],
points: List[Tensor],
gt_bboxes: BaseInstance3DBoxes, gt_labels: Tensor,
input_meta: dict) -> Tuple[Tensor, ...]:
"""Loss function of single sample.
Args:
center_preds (list[Tensor]): Centerness predictions for all levels.
bbox_preds (list[Tensor]): Bbox predictions for all levels.
cls_preds (list[Tensor]): Classification predictions for all
levels.
points (list[Tensor]): Final location coordinates for all levels.
gt_bboxes (:obj:`BaseInstance3DBoxes`): Ground truth boxes.
gt_labels (Tensor): Ground truth labels.
input_meta (dict): Scene meta info.
Returns:
tuple[Tensor, ...]: Centerness, bbox, and classification loss
values.
"""
center_targets, bbox_targets, cls_targets = self.get_targets(
points, gt_bboxes, gt_labels)
center_preds = torch.cat(center_preds)
bbox_preds = torch.cat(bbox_preds)
cls_preds = torch.cat(cls_preds)
points = torch.cat(points)
# cls loss
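# Unmatched locations carry label -1, which matches no class channel
# and is therefore treated as background by the sigmoid focal loss.
# reduce_mean averages the positive count across GPUs so avg_factor is
# consistent in distributed training.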
pos_inds = torch.nonzero(cls_targets >= 0).squeeze(1)
n_pos = points.new_tensor(len(pos_inds))
n_pos = max(reduce_mean(n_pos), 1.)
cls_loss = self.cls_loss(cls_preds, cls_targets, avg_factor=n_pos)
# bbox and centerness losses
pos_center_preds = center_preds[pos_inds]
pos_bbox_preds = bbox_preds[pos_inds]
pos_center_targets = center_targets[pos_inds].unsqueeze(1)
pos_bbox_targets = bbox_targets[pos_inds]
# reduce_mean is outside if / else block to prevent deadlock
center_denorm = max(
reduce_mean(pos_center_targets.sum().detach()), 1e-6)
if len(pos_inds) > 0:
pos_points = points[pos_inds]
center_loss = self.center_loss(
pos_center_preds, pos_center_targets, avg_factor=n_pos)
bbox_loss = self.bbox_loss(
self._bbox_to_loss(
self._bbox_pred_to_bbox(pos_points, pos_bbox_preds)),
self._bbox_to_loss(pos_bbox_targets),
weight=pos_center_targets.squeeze(1),
avg_factor=center_denorm)
else:
center_loss = pos_center_preds.sum()
bbox_loss = pos_bbox_preds.sum()
return center_loss, bbox_loss, cls_loss
def loss_by_feat(self,
center_preds: List[List[Tensor]],
bbox_preds: List[List[Tensor]],
cls_preds: List[List[Tensor]],
points: List[List[Tensor]],
batch_gt_instances_3d: InstanceList,
batch_input_metas: List[dict],
batch_gt_instances_ignore: OptInstanceList = None,
**kwargs) -> dict:
"""Loss function about feature.
Args:
center_preds (list[list[Tensor]]): Centerness predictions for
all scenes. The first list contains predictions from different
levels. The second list contains predictions in a mini-batch.
bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes.
The first list contains predictions from different
levels. The second list contains predictions in a mini-batch.
cls_preds (list[list[Tensor]]): Classification predictions for all
scenes. The first list contains predictions from different
levels. The second list contains predictions in a mini-batch.
points (list[list[Tensor]]): Final location coordinates for all
scenes. The first list contains predictions from different
levels. The second list contains predictions in a mini-batch.
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instance_3d. It usually includes ``bboxes_3d``, ``labels_3d``,
``depths``, ``centers_2d`` and attributes.
batch_input_metas (list[dict]): Meta information of each scene.
batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
Batch of gt_instances_ignore. It includes ``bboxes`` attribute
data that is ignored during training and testing.
Defaults to None.
Returns:
dict: Centerness, bbox, and classification losses.
"""
center_losses, bbox_losses, cls_losses = [], [], []
for i in range(len(batch_input_metas)):
center_loss, bbox_loss, cls_loss = self._loss_by_feat_single(
center_preds=[x[i] for x in center_preds],
bbox_preds=[x[i] for x in bbox_preds],
cls_preds=[x[i] for x in cls_preds],
points=[x[i] for x in points],
input_meta=batch_input_metas[i],
gt_bboxes=batch_gt_instances_3d[i].bboxes_3d,
gt_labels=batch_gt_instances_3d[i].labels_3d)
center_losses.append(center_loss)
bbox_losses.append(bbox_loss)
cls_losses.append(cls_loss)
return dict(
center_loss=torch.mean(torch.stack(center_losses)),
bbox_loss=torch.mean(torch.stack(bbox_losses)),
cls_loss=torch.mean(torch.stack(cls_losses)))
def _predict_by_feat_single(self, center_preds: List[Tensor],
bbox_preds: List[Tensor],
cls_preds: List[Tensor], points: List[Tensor],
input_meta: dict) -> InstanceData:
"""Generate boxes for single sample.
Args:
center_preds (list[Tensor]): Centerness predictions for all levels.
bbox_preds (list[Tensor]): Bbox predictions for all levels.
cls_preds (list[Tensor]): Classification predictions for all
levels.
points (list[Tensor]): Final location coordinates for all levels.
input_meta (dict): Scene meta info.
Returns:
InstanceData: Predicted bounding boxes, scores and labels.
"""
mlvl_bboxes, mlvl_scores = [], []
for center_pred, bbox_pred, cls_pred, point in zip(
center_preds, bbox_preds, cls_preds, points):
scores = cls_pred.sigmoid() * center_pred.sigmoid()
max_scores, _ = scores.max(dim=1)
if len(scores) > self.test_cfg.nms_pre > 0:
_, ids = max_scores.topk(self.test_cfg.nms_pre)
bbox_pred = bbox_pred[ids]
scores = scores[ids]
point = point[ids]
bboxes = self._bbox_pred_to_bbox(point, bbox_pred)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
bboxes = torch.cat(mlvl_bboxes)
scores = torch.cat(mlvl_scores)
bboxes, scores, labels = self._single_scene_multiclass_nms(
bboxes, scores, input_meta)
bboxes = input_meta['box_type_3d'](
bboxes,
box_dim=bboxes.shape[1],
with_yaw=bboxes.shape[1] == 7,
origin=(.5, .5, .5))
results = InstanceData()
results.bboxes_3d = bboxes
results.scores_3d = scores
results.labels_3d = labels
return results
def predict_by_feat(self, center_preds: List[List[Tensor]],
bbox_preds: List[List[Tensor]], cls_preds: List[List[Tensor]],
points: List[List[Tensor]],
batch_input_metas: List[dict],
**kwargs) -> List[InstanceData]:
"""Generate boxes for all scenes.
Args:
center_preds (list[list[Tensor]]): Centerness predictions for
all scenes.
bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes.
cls_preds (list[list[Tensor]]): Classification predictions for all
scenes.
points (list[list[Tensor]]): Final location coordinates for all
scenes.
batch_input_metas (list[dict]): Meta infos for all scenes.
Returns:
list[InstanceData]: Predicted bboxes, scores, and labels for
all scenes.
"""
results = []
for i in range(len(batch_input_metas)):
result = self._predict_by_feat_single(
center_preds=[x[i] for x in center_preds],
bbox_preds=[x[i] for x in bbox_preds],
cls_preds=[x[i] for x in cls_preds],
points=[x[i] for x in points],
input_meta=batch_input_metas[i])
results.append(result)
return results
@staticmethod
def _bbox_to_loss(bbox: Tensor) -> Tensor:
"""Transform box to the axis-aligned or rotated iou loss format.
Args:
bbox (Tensor): 3D box of shape (N, 6) or (N, 7).
Returns:
Tensor: Transformed 3D box of shape (N, 6) or (N, 7).
"""
# rotated iou loss accepts (x, y, z, w, h, l, heading)
if bbox.shape[-1] != 6:
return bbox
# axis-aligned case: x, y, z, w, h, l -> x1, y1, z1, x2, y2, z2
return torch.stack(
(bbox[..., 0] - bbox[..., 3] / 2, bbox[..., 1] - bbox[..., 4] / 2,
bbox[..., 2] - bbox[..., 5] / 2, bbox[..., 0] + bbox[..., 3] / 2,
bbox[..., 1] + bbox[..., 4] / 2, bbox[..., 2] + bbox[..., 5] / 2),
dim=-1)
@staticmethod
def _bbox_pred_to_bbox(points: Tensor, bbox_pred: Tensor) -> Tensor:
"""Transform predicted bbox parameters to bbox.
Args:
points (Tensor): Final locations of shape (N, 3)
bbox_pred (Tensor): Predicted bbox parameters of shape (N, 6)
or (N, 8).
Returns:
Tensor: Transformed 3D box of shape (N, 6) or (N, 7).
"""
if bbox_pred.shape[0] == 0:
return bbox_pred
x_center = points[:, 0] + (bbox_pred[:, 1] - bbox_pred[:, 0]) / 2
y_center = points[:, 1] + (bbox_pred[:, 3] - bbox_pred[:, 2]) / 2
z_center = points[:, 2] + (bbox_pred[:, 5] - bbox_pred[:, 4]) / 2
# dx_min, dx_max, dy_min, dy_max, dz_min, dz_max -> x, y, z, w, l, h
base_bbox = torch.stack([
x_center,
y_center,
z_center,
bbox_pred[:, 0] + bbox_pred[:, 1],
bbox_pred[:, 2] + bbox_pred[:, 3],
bbox_pred[:, 4] + bbox_pred[:, 5],
], -1)
# axis-aligned case
if bbox_pred.shape[1] == 6:
return base_bbox
# rotated case: ..., sin(2a)ln(q), cos(2a)ln(q)
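# q >= 1 is the ratio of the two BEV extents, so sqrt(p6^2 + p7^2)
# equals |ln(q)| and recovers q; atan2 cancels the common ln(q) factor
# and recovers the heading alpha. The summed BEV extent (w + l) is then
# split back into the two sides via q.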
scale = bbox_pred[:, 0] + bbox_pred[:, 1] + \
bbox_pred[:, 2] + bbox_pred[:, 3]
q = torch.exp(
torch.sqrt(
torch.pow(bbox_pred[:, 6], 2) + torch.pow(bbox_pred[:, 7], 2)))
alpha = 0.5 * torch.atan2(bbox_pred[:, 6], bbox_pred[:, 7])
return torch.stack(
(x_center, y_center, z_center, scale / (1 + q), scale /
(1 + q) * q, bbox_pred[:, 5] + bbox_pred[:, 4], alpha),
dim=-1)
@staticmethod
def _get_face_distances(points: Tensor, boxes: Tensor) -> Tensor:
"""Calculate distances from point to box faces.
Args:
points (Tensor): Final locations of shape (N_points, N_boxes, 3).
boxes (Tensor): 3D boxes of shape (N_points, N_boxes, 7)
Returns:
Tensor: Face distances of shape (N_points, N_boxes, 6),
(dx_min, dx_max, dy_min, dy_max, dz_min, dz_max).
"""
shift = torch.stack(
(points[..., 0] - boxes[..., 0], points[..., 1] - boxes[..., 1],
points[..., 2] - boxes[..., 2]),
dim=-1).permute(1, 0, 2)
shift = rotation_3d_in_axis(
shift, -boxes[0, :, 6], axis=2).permute(1, 0, 2)
centers = boxes[..., :3] + shift
dx_min = centers[..., 0] - boxes[..., 0] + boxes[..., 3] / 2
dx_max = boxes[..., 0] + boxes[..., 3] / 2 - centers[..., 0]
dy_min = centers[..., 1] - boxes[..., 1] + boxes[..., 4] / 2
dy_max = boxes[..., 1] + boxes[..., 4] / 2 - centers[..., 1]
dz_min = centers[..., 2] - boxes[..., 2] + boxes[..., 5] / 2
dz_max = boxes[..., 2] + boxes[..., 5] / 2 - centers[..., 2]
return torch.stack((dx_min, dx_max, dy_min, dy_max, dz_min, dz_max),
dim=-1)
@staticmethod
def _get_centerness(face_distances: Tensor) -> Tensor:
"""Compute point centerness w.r.t containing box.
Args:
face_distances (Tensor): Face distances of shape (B, N, 6),
(dx_min, dx_max, dy_min, dy_max, dz_min, dz_max).
Returns:
Tensor: Centerness of shape (B, N).
"""
x_dims = face_distances[..., [0, 1]]
y_dims = face_distances[..., [2, 3]]
z_dims = face_distances[..., [4, 5]]
centerness_targets = x_dims.min(dim=-1)[0] / x_dims.max(dim=-1)[0] * \
y_dims.min(dim=-1)[0] / y_dims.max(dim=-1)[0] * \
z_dims.min(dim=-1)[0] / z_dims.max(dim=-1)[0]
return torch.sqrt(centerness_targets)
@torch.no_grad()
def get_targets(self, points: List[Tensor], gt_bboxes: BaseInstance3DBoxes,
gt_labels: Tensor) -> Tuple[Tensor, ...]:
"""Compute targets for final locations for a single scene.
Args:
points (list[Tensor]): Final locations for all levels.
gt_bboxes (BaseInstance3DBoxes): Ground truth boxes.
gt_labels (Tensor): Ground truth labels.
Returns:
tuple[Tensor, ...]: Centerness, bbox and classification
targets for all locations.
"""
float_max = points[0].new_tensor(1e8)
n_levels = len(points)
levels = torch.cat([
points[i].new_tensor(i).expand(len(points[i]))
for i in range(len(points))
])
points = torch.cat(points)
gt_bboxes = gt_bboxes.to(points.device)
n_points = len(points)
n_boxes = len(gt_bboxes)
volumes = gt_bboxes.volume.unsqueeze(0).expand(n_points, n_boxes)
# condition 1: point inside box
boxes = torch.cat((gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]),
dim=1)
boxes = boxes.expand(n_points, n_boxes, 7)
points = points.unsqueeze(1).expand(n_points, n_boxes, 3)
face_distances = self._get_face_distances(points, boxes)
inside_box_condition = face_distances.min(dim=-1).values > 0
# condition 2: positive points per level >= limit
# calculate positive points per scale
n_pos_points_per_level = []
for i in range(n_levels):
n_pos_points_per_level.append(
torch.sum(inside_box_condition[levels == i], dim=0))
# find best level
n_pos_points_per_level = torch.stack(n_pos_points_per_level, dim=0)
lower_limit_mask = n_pos_points_per_level < self.pts_assign_threshold
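# argmax finds the first (finest-to-coarsest) level whose positive
# count drops below the threshold; the level just before it is the
# best level, clamped to 0. If no level drops below the threshold,
# fall back to the last (coarsest) level.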
lower_index = torch.argmax(lower_limit_mask.int(), dim=0) - 1
lower_index = torch.where(lower_index < 0, 0, lower_index)
all_upper_limit_mask = torch.all(
torch.logical_not(lower_limit_mask), dim=0)
best_level = torch.where(all_upper_limit_mask, n_levels - 1,
lower_index)
# keep only points with best level
best_level = best_level.expand(n_points, n_boxes)
levels = torch.unsqueeze(levels, 1).expand(n_points, n_boxes)
level_condition = best_level == levels
# condition 3: limit topk points per box by centerness
centerness = self._get_centerness(face_distances)
centerness = torch.where(inside_box_condition, centerness,
torch.ones_like(centerness) * -1)
centerness = torch.where(level_condition, centerness,
torch.ones_like(centerness) * -1)
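# Points outside the box or on the wrong level were masked to -1 above,
# so they can never be selected among the top-centerness candidates.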
top_centerness = torch.topk(
centerness,
min(self.pts_center_threshold + 1, len(centerness)),
dim=0).values[-1]
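# top_centerness is the (pts_center_threshold + 1)-th largest value per
# box; the strict '>' keeps at most pts_center_threshold points.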
topk_condition = centerness > top_centerness.unsqueeze(0)
# condition 4: min volume box per point
volumes = torch.where(inside_box_condition, volumes, float_max)
volumes = torch.where(level_condition, volumes, float_max)
volumes = torch.where(topk_condition, volumes, float_max)
min_volumes, min_inds = volumes.min(dim=1)
center_targets = centerness[torch.arange(n_points), min_inds]
bbox_targets = boxes[torch.arange(n_points), min_inds]
if not gt_bboxes.with_yaw:
bbox_targets = bbox_targets[:, :-1]
cls_targets = gt_labels[min_inds]
cls_targets = torch.where(min_volumes == float_max, -1, cls_targets)
return center_targets, bbox_targets, cls_targets
def _single_scene_multiclass_nms(self, bboxes: Tensor, scores: Tensor,
input_meta: dict) -> Tuple[Tensor, ...]:
"""Multi-class nms for a single scene.
Args:
bboxes (Tensor): Predicted boxes of shape (N_boxes, 6) or
(N_boxes, 7).
scores (Tensor): Predicted scores of shape (N_boxes, N_classes).
input_meta (dict): Scene meta data.
Returns:
tuple[Tensor, ...]: Predicted bboxes, scores and labels.
"""
num_classes = scores.shape[1]
with_yaw = bboxes.shape[1] == 7
nms_bboxes, nms_scores, nms_labels = [], [], []
for i in range(num_classes):
ids = scores[:, i] > self.test_cfg.score_thr
if not ids.any():
continue
class_scores = scores[ids, i]
class_bboxes = bboxes[ids]
if with_yaw:
nms_function = nms3d
else:
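# Pad a zero yaw so axis-aligned boxes match the (N, 7) layout that
# nms3d_normal expects; the extra column is stripped again after NMS.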
class_bboxes = torch.cat(
(class_bboxes, torch.zeros_like(class_bboxes[:, :1])),
dim=1)
nms_function = nms3d_normal
nms_ids = nms_function(class_bboxes, class_scores,
self.test_cfg.iou_thr)
nms_bboxes.append(class_bboxes[nms_ids])
nms_scores.append(class_scores[nms_ids])
nms_labels.append(
bboxes.new_full(
class_scores[nms_ids].shape, i, dtype=torch.long))
if len(nms_bboxes):
nms_bboxes = torch.cat(nms_bboxes, dim=0)
nms_scores = torch.cat(nms_scores, dim=0)
nms_labels = torch.cat(nms_labels, dim=0)
else:
nms_bboxes = bboxes.new_zeros((0, bboxes.shape[1]))
nms_scores = bboxes.new_zeros((0, ))
nms_labels = bboxes.new_zeros((0, ))
if not with_yaw:
nms_bboxes = nms_bboxes[:, :6]
return nms_bboxes, nms_scores, nms_labels
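For context, here is a minimal sketch of constructing this head directly, outside a config file. The channel sizes and thresholds are illustrative assumptions rather than values prescribed by this diff, and MinkowskiEngine must be installed for construction to succeed:

from mmengine.config import ConfigDict

from mmdet3d.models.dense_heads import FCAF3DHead

# Hypothetical numbers for a four-level sparse backbone; num_reg_outs=6
# yields axis-aligned boxes, while 8 would add the sin/cos angle pair.
head = FCAF3DHead(
    num_classes=18,
    in_channels=(64, 128, 256, 512),
    out_channels=128,
    num_reg_outs=6,
    voxel_size=0.01,
    pts_prune_threshold=100000,
    pts_assign_threshold=27,
    pts_center_threshold=18,
    test_cfg=ConfigDict(nms_pre=1000, iou_thr=0.5, score_thr=0.01))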
...@@ -4,6 +4,7 @@ from typing import List, Optional, Sequence, Tuple ...@@ -4,6 +4,7 @@ from typing import List, Optional, Sequence, Tuple
import numpy as np import numpy as np
import torch import torch
from mmcv.cnn import Scale from mmcv.cnn import Scale
from mmdet.models.utils import multi_apply, select_single_mlvl
from mmengine.model import normal_init from mmengine.model import normal_init
from mmengine.structures import InstanceData from mmengine.structures import InstanceData
from torch import Tensor from torch import Tensor
...@@ -14,7 +15,6 @@ from mmdet3d.registry import MODELS, TASK_UTILS ...@@ -14,7 +15,6 @@ from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures import limit_period, points_img2cam, xywhr2xyxyr from mmdet3d.structures import limit_period, points_img2cam, xywhr2xyxyr
from mmdet3d.utils import (ConfigType, InstanceList, OptConfigType, from mmdet3d.utils import (ConfigType, InstanceList, OptConfigType,
OptInstanceList) OptInstanceList)
from mmdet.models.utils import multi_apply, select_single_mlvl
from .anchor_free_mono3d_head import AnchorFreeMono3DHead from .anchor_free_mono3d_head import AnchorFreeMono3DHead
RangeType = Sequence[Tuple[int, int]] RangeType = Sequence[Tuple[int, int]]
......