Commit b496f579 authored by ZCMax, committed by ChaimZhu

[Refactor] Refactor Mono3D models

parent 35667791
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from abc import ABCMeta, abstractmethod
from typing import List, Optional
from typing import Optional, Tuple

from mmcv.runner import BaseModule
from mmengine.config import ConfigDict
from torch import Tensor

from mmdet3d.core import Det3DDataSample
from mmdet3d.core.utils import InstanceList, OptMultiConfig, SampleList


class BaseMono3DDenseHead(BaseModule, metaclass=ABCMeta):
    """Base class for Monocular 3D DenseHeads."""

    def __init__(self, init_cfg: Optional[dict] = None) -> None:
        super(BaseMono3DDenseHead, self).__init__(init_cfg=init_cfg)

    @abstractmethod
    def loss(self, **kwargs):
        """Compute losses of the head."""
        pass

    def get_bboxes(self, *args, **kwargs):
        warnings.warn('`get_bboxes` is deprecated and will be removed in '
                      'the future. Please use `get_results` instead.')
        return self.get_results(*args, **kwargs)

    @abstractmethod
    def get_results(self, *args, **kwargs):
        """Transform network outputs of a batch into 3D bbox results."""
        pass


class BaseMono3DDenseHead(BaseModule, metaclass=ABCMeta):
    """Base class for Monocular 3D DenseHeads.

    1. The ``loss`` method is used to calculate the loss of the dense head,
    which includes two steps: (1) the dense head performs forward propagation
    to obtain the feature maps; (2) the ``loss_by_feat`` method is called on
    the feature maps to calculate the loss.

    .. code:: text

        loss(): forward() -> loss_by_feat()

    2. The ``predict`` method is used to predict detection results, which
    includes two steps: (1) the dense head performs forward propagation to
    obtain the feature maps; (2) the ``predict_by_feat`` method is called on
    the feature maps to predict detection results, including post-processing.

    .. code:: text

        predict(): forward() -> predict_by_feat()

    3. The ``loss_and_predict`` method is used to return losses and detection
    results at the same time. It calls the dense head's ``forward``,
    ``loss_by_feat`` and ``predict_by_feat`` methods in order. When a
    one-stage head is used as an RPN, it needs to return both losses and
    predictions; the predictions are then used as proposals for the RoI head.

    .. code:: text

        loss_and_predict(): forward() -> loss_by_feat() -> predict_by_feat()
    """

    def __init__(self, init_cfg: OptMultiConfig = None) -> None:
        super(BaseMono3DDenseHead, self).__init__(init_cfg=init_cfg)

    def forward_train(self,
                      x: List[Tensor],
                      batch_data_samples: List[Det3DDataSample],
                      proposal_cfg: Optional[ConfigDict] = None,
                      **kwargs):
def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList,
**kwargs) -> dict:
"""
Args:
x (list[Tensor]): Features from FPN.
batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
contains the meta information of each image and corresponding
annotations.
proposal_cfg (mmengine.Config, optional): Test / postprocessing
configuration, if None, test_cfg would be used.
Defaults to None.
Returns:
tuple or Tensor: When `proposal_cfg` is None, the detector is a \
......@@ -73,26 +81,105 @@ class BaseMono3DDenseHead(BaseModule, metaclass=ABCMeta):
outs = self(x)
batch_gt_instances_3d = []
batch_gt_instances = []
batch_gt_instances_ignore = []
batch_img_metas = []
for data_sample in batch_data_samples:
batch_img_metas.append(data_sample.metainfo)
batch_gt_instances_3d.append(data_sample.gt_instances_3d)
batch_gt_instances.append(data_sample.gt_instances)
batch_gt_instances_ignore.append(
data_sample.get('ignored_instances', None))
loss_inputs = outs + (batch_gt_instances_3d, batch_gt_instances,
batch_img_metas, batch_gt_instances_ignore)
losses = self.loss_by_feat(*loss_inputs)
return losses
@abstractmethod
def loss_by_feat(self, **kwargs) -> dict:
"""Calculate the loss based on the features extracted by the detection
head."""
pass
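Under this contract a concrete head only implements ``forward``, ``loss_by_feat`` and ``predict_by_feat``; the base class wires them together. A minimal sketch of a conforming subclass (``ToyMono3DHead`` and its channel sizes are made up for illustration, not part of this commit):

from torch import nn


class ToyMono3DHead(BaseMono3DDenseHead):
    """Hypothetical head that only illustrates the refactored interface."""

    def __init__(self, in_channels=64, num_classes=3):
        super().__init__()
        self.cls_conv = nn.Conv2d(in_channels, num_classes, 1)
        self.reg_conv = nn.Conv2d(in_channels, 7, 1)

    def forward(self, x):
        # one classification map and one bbox map per feature level
        return ([self.cls_conv(f) for f in x], [self.reg_conv(f) for f in x])

    def loss_by_feat(self, cls_scores, bbox_preds, batch_gt_instances_3d,
                     batch_gt_instances, batch_img_metas,
                     batch_gt_instances_ignore=None):
        # toy losses; a real head matches targets built from the gt instances
        return dict(
            loss_cls=sum(s.sigmoid().mean() for s in cls_scores),
            loss_bbox=sum(p.abs().mean() for p in bbox_preds))

    def predict_by_feat(self, cls_scores, bbox_preds, batch_img_metas=None,
                        cfg=None, rescale=False):
        # a real head decodes boxes into InstanceData here
        return [None] * cls_scores[0].size(0)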
def loss_and_predict(self,
x: Tuple[Tensor],
batch_data_samples: SampleList,
proposal_cfg: Optional[ConfigDict] = None,
**kwargs) -> Tuple[dict, InstanceList]:
"""Perform forward propagation of the head, then calculate loss and
predictions from the features and data samples.
Args:
x (tuple[Tensor]): Features from FPN.
batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
contains the meta information of each image and
corresponding annotations.
proposal_cfg (ConfigDict, optional): Test / postprocessing
configuration, if None, test_cfg would be used.
Defaults to None.
Returns:
            tuple: The return value is a tuple that contains:
- losses: (dict[str, Tensor]): A dictionary of loss components.
- predictions (list[:obj:`InstanceData`]): Detection
results of each image after the post process.
"""
        batch_gt_instances_3d = []
        batch_gt_instances = []
        batch_gt_instances_ignore = []
        batch_img_metas = []
        for data_sample in batch_data_samples:
            batch_img_metas.append(data_sample.metainfo)
            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
            if 'ignored_instances' in data_sample:
                batch_gt_instances_ignore.append(data_sample.ignored_instances)
            else:
                batch_gt_instances_ignore.append(None)
            batch_gt_instances.append(data_sample.gt_instances)
            batch_gt_instances_ignore.append(
                data_sample.get('ignored_instances', None))

        outs = self(x)

        loss_inputs = outs + (batch_gt_instances_3d, batch_img_metas,
                              batch_gt_instances_ignore)
        losses = self.loss(*loss_inputs)
        if proposal_cfg is None:
            return losses
        else:
            batch_img_metas = [
                data_sample.metainfo for data_sample in batch_data_samples
            ]
            results_list = self.get_results(
                *outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg)
            return losses, results_list

        loss_inputs = outs + (batch_gt_instances_3d, batch_gt_instances,
                              batch_img_metas, batch_gt_instances_ignore)
        losses = self.loss_by_feat(*loss_inputs)

        predictions = self.predict_by_feat(
            *outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg)
        return losses, predictions
def predict(self,
x: Tuple[Tensor],
batch_data_samples: SampleList,
rescale: bool = False) -> InstanceList:
"""Perform forward propagation of the detection head and predict
detection results on the features of the upstream network.
Args:
x (tuple[Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`, `gt_pts_panoptic_seg` and `gt_pts_sem_seg`.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
Returns:
            list[:obj:`InstanceData`]: Detection results of each image
after the post process.
"""
batch_img_metas = [
data_samples.metainfo for data_samples in batch_data_samples
]
outs = self(x)
predictions = self.predict_by_feat(
*outs, batch_img_metas=batch_img_metas, rescale=rescale)
return predictions
@abstractmethod
def predict_by_feat(self, **kwargs) -> InstanceList:
"""Transform a batch of output features extracted from the head into
bbox results."""
pass
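Taken together, the entry points of the refactored head compose as follows (schematic; the variable names are illustrative):

# feats = detector.extract_feat(batch_inputs_dict)          # tuple[Tensor]
# losses = head.loss(feats, batch_data_samples)              # training
# results = head.predict(feats, batch_data_samples)          # inference
# losses, preds = head.loss_and_predict(
#     feats, batch_data_samples, proposal_cfg=proposal_cfg)  # RPN-style use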
......@@ -3,7 +3,6 @@ from typing import List, Optional, Tuple, Union
import torch
from mmcv.cnn import xavier_init
from mmcv.runner import force_fp32
from mmengine.config import ConfigDict
from mmengine.data import InstanceData
from torch import Tensor
......@@ -197,39 +196,8 @@ class MonoFlexHead(AnchorFreeMono3DHead):
if self.use_edge_fusion:
self._init_edge_module()
def forward_train(self,
x: List[Tensor],
batch_data_samples: List[Det3DDataSample],
proposal_cfg: Optional[ConfigDict] = None,
**kwargs):
"""
Args:
x (list[Tensor]): Features from FPN.
batch_img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes (list[Tensor]): Ground truth bboxes of the image,
shape (num_gts, 4).
gt_labels (list[Tensor]): Ground truth labels of each box,
shape (num_gts,).
gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image,
shape (num_gts, self.bbox_code_size).
gt_labels_3d (list[Tensor]): 3D ground truth labels of each box,
shape (num_gts,).
centers_2d (list[Tensor]): Projected 3D center of each box,
shape (num_gts, 2).
depths (list[Tensor]): Depth of projected 3D center of each box,
shape (num_gts,).
attr_labels (list[Tensor]): Attribute labels of each box,
shape (num_gts,).
gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
proposal_cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used
Returns:
tuple:
losses: (dict[str, Tensor]): A dictionary of loss components.
proposal_list (list[Tensor]): Proposals of each image.
"""
def loss(self, x: List[Tensor], batch_data_samples: List[Det3DDataSample],
**kwargs):
"""
Args:
x (list[Tensor]): Features from FPN.
......@@ -266,15 +234,15 @@ class MonoFlexHead(AnchorFreeMono3DHead):
"""
batch_gt_instances_3d = []
batch_gt_instances = []
batch_gt_instances_ignore = []
batch_img_metas = []
for data_sample in batch_data_samples:
batch_img_metas.append(data_sample.metainfo)
batch_gt_instances_3d.append(data_sample.gt_instances_3d)
if 'ignored_instances' in data_sample:
batch_gt_instances_ignore.append(data_sample.ignored_instances)
else:
batch_gt_instances_ignore.append(None)
batch_gt_instances.append(data_sample.gt_instances)
batch_gt_instances_ignore.append(
data_sample.get('ignored_instances', None))
# monoflex head needs img_metas for feature extraction
outs = self(x, batch_img_metas)
......@@ -282,15 +250,7 @@ class MonoFlexHead(AnchorFreeMono3DHead):
batch_gt_instances_ignore)
losses = self.loss(*loss_inputs)
if proposal_cfg is None:
return losses
else:
batch_img_metas = [
data_sample.metainfo for data_sample in batch_data_samples
]
results_list = self.get_results(
*outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg)
return losses, results_list
return losses
def forward(self, feats: List[Tensor], batch_img_metas: List[dict]):
"""Forward features from the upstream network.
......@@ -373,9 +333,8 @@ class MonoFlexHead(AnchorFreeMono3DHead):
return cls_score, bbox_pred
@force_fp32(apply_to=('cls_scores', 'bbox_preds'))
def get_results(self, cls_scores: List[Tensor], bbox_preds: List[Tensor],
batch_img_metas: List[dict]):
def predict_by_feat(self, cls_scores: List[Tensor],
bbox_preds: List[Tensor], batch_img_metas: List[dict]):
"""Generate bboxes from bbox head predictions.
Args:
......@@ -393,7 +352,7 @@ class MonoFlexHead(AnchorFreeMono3DHead):
cls_scores[0].new_tensor(input_meta['cam2img'])
for input_meta in batch_img_metas
])
batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap(
batch_bboxes, batch_scores, batch_topk_labels = self._decode_heatmap(
cls_scores[0],
bbox_preds[0],
batch_img_metas,
......@@ -429,13 +388,13 @@ class MonoFlexHead(AnchorFreeMono3DHead):
return result_list
def decode_heatmap(self,
cls_score: Tensor,
reg_pred: Tensor,
batch_img_metas: List[dict],
cam2imgs: Tensor,
topk: int = 100,
kernel: int = 3):
def _decode_heatmap(self,
cls_score: Tensor,
reg_pred: Tensor,
batch_img_metas: List[dict],
cam2imgs: Tensor,
topk: int = 100,
kernel: int = 3):
"""Transform outputs into detections raw bbox predictions.
Args:
......@@ -530,14 +489,16 @@ class MonoFlexHead(AnchorFreeMono3DHead):
return preds
def get_targets(self, batch_gt_instances_3d: List[InstanceData],
batch_gt_instances: List[InstanceData],
feat_shape: Tuple[int], batch_img_metas: List[dict]):
"""Get training targets for batch images.
Args:
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instance_3d. It usually includes ``bboxes``、``labels``
、``bboxes_3d``、``labels_3d``、``depths``、``centers_2d`` and
attributes.
gt_instance_3d. It usually includes ``bboxes_3d``、
``labels_3d``、``depths``、``centers_2d`` and attributes.
batch_gt_instances (list[:obj:`InstanceData`]): Batch of
gt_instance. It usually includes ``bboxes``、``labels``.
feat_shape (tuple[int]): Feature map shape with value,
shape (B, _, H, W).
batch_img_metas (list[dict]): Meta information of each image, e.g.,
......@@ -574,10 +535,10 @@ class MonoFlexHead(AnchorFreeMono3DHead):
"""
gt_bboxes_list = [
gt_instances_3d.bboxes for gt_instances_3d in batch_gt_instances_3d
gt_instances.bboxes for gt_instances in batch_gt_instances
]
gt_labels_list = [
gt_instances_3d.labels for gt_instances_3d in batch_gt_instances_3d
gt_instances.labels for gt_instances in batch_gt_instances
]
gt_bboxes_3d_list = [
gt_instances_3d.bboxes_3d
......@@ -721,12 +682,14 @@ class MonoFlexHead(AnchorFreeMono3DHead):
return center_heatmap_target, avg_factor, target_labels
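For context, the center-heatmap target assembled above follows the CenterNet recipe: every ground-truth center is splatted as a 2D Gaussian whose radius scales with the box size, and overlapping peaks are merged with an element-wise maximum. A simplified, self-contained stand-in (the function names here are illustrative, not the mmdet ``gaussian_radius``/``gen_gaussian_target`` helpers):

import torch


def gaussian_2d(radius, sigma):
    # (2 * radius + 1) x (2 * radius + 1) Gaussian kernel centered at 0
    x = torch.arange(-radius, radius + 1, dtype=torch.float32)
    y = x[:, None]
    return torch.exp(-(x * x + y * y) / (2 * sigma * sigma))


def draw_center(heatmap, cx, cy, radius):
    # splat one gt center; clip the kernel at the feature-map borders
    g = gaussian_2d(radius, sigma=(2 * radius + 1) / 6)
    h, w = heatmap.shape
    left, right = min(cx, radius), min(w - cx, radius + 1)
    top, bottom = min(cy, radius), min(h - cy, radius + 1)
    patch = g[radius - top:radius + bottom, radius - left:radius + right]
    region = heatmap[cy - top:cy + bottom, cx - left:cx + right]
    region.copy_(torch.maximum(region, patch))  # keep the stronger peak
    return heatmap


heatmap = torch.zeros(96, 312)  # one class channel at feature resolution
draw_center(heatmap, cx=100, cy=40, radius=6)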
def loss(self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
batch_gt_instances_3d: List[InstanceData],
batch_img_metas: List[dict],
batch_gt_instances_ignore: Optional[List[InstanceData]] = None):
def loss_by_feat(
self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
batch_gt_instances_3d: List[InstanceData],
batch_gt_instances: List[InstanceData],
batch_img_metas: List[dict],
batch_gt_instances_ignore: Optional[List[InstanceData]] = None):
"""Compute loss of the head.
Args:
......@@ -736,9 +699,10 @@ class MonoFlexHead(AnchorFreeMono3DHead):
number is bbox_code_size.
shape (B, 7, H, W).
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instance_3d. It usually includes ``bboxes``、``labels``
、``bboxes_3d``、``labels_3d``、``depths``、``centers_2d`` and
attributes.
gt_instance_3d. It usually includes ``bboxes_3d``、
``labels_3d``、``depths``、``centers_2d`` and attributes.
batch_gt_instances (list[:obj:`InstanceData`]): Batch of
gt_instance. It usually includes ``bboxes``、``labels``.
batch_img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
......@@ -756,6 +720,7 @@ class MonoFlexHead(AnchorFreeMono3DHead):
center2d_heatmap_target, avg_factor, target_labels = \
self.get_targets(batch_gt_instances_3d,
batch_gt_instances,
center2d_heatmap.shape,
batch_img_metas)
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
from typing import List, Optional, Tuple
import numpy as np
import torch
from mmcv.cnn import Scale, bias_init_with_prob, normal_init
from mmcv.runner import force_fp32
from mmengine.data import InstanceData
from torch import Tensor
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.core import box3d_multiclass_nms, xywhr2xyxyr
from mmdet3d.core.bbox import points_cam2img, points_img2cam
from mmdet3d.core.utils import (ConfigType, InstanceList, OptConfigType,
OptInstanceList)
from mmdet3d.registry import MODELS
from mmdet.core import distance2bbox, multi_apply
from .fcos_mono3d_head import FCOSMono3DHead
......@@ -86,7 +88,7 @@ class PGDHead(FCOSMono3DHead):
base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6),
(3.9, 1.56, 1.6)),
code_size=7),
**kwargs):
**kwargs) -> None:
self.use_depth_classifier = use_depth_classifier
self.use_onlyreg_proj = use_onlyreg_proj
self.depth_branch = depth_branch
......@@ -190,11 +192,11 @@ class PGDHead(FCOSMono3DHead):
for conv_weight in self.conv_weights:
normal_init(conv_weight, std=0.01)
def forward(self, feats):
def forward(self, x: Tuple[Tensor]) -> Tuple[Tensor, ...]:
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
x (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
......@@ -220,10 +222,10 @@ class PGDHead(FCOSMono3DHead):
centernesses (list[Tensor]): Centerness for each scale level,
each is a 4D-tensor, the channel number is num_points * 1.
"""
return multi_apply(self.forward_single, feats, self.scales,
self.strides)
return multi_apply(self.forward_single, x, self.scales, self.strides)
def forward_single(self, x, scale, stride):
def forward_single(self, x: Tensor, scale: Scale,
stride: int) -> Tuple[Tensor, ...]:
"""Forward features of a single scale level.
Args:
......@@ -271,17 +273,17 @@ class PGDHead(FCOSMono3DHead):
attr_pred, centerness
def get_proj_bbox2d(self,
bbox_preds,
pos_dir_cls_preds,
labels_3d,
bbox_targets_3d,
pos_points,
pos_inds,
batch_img_metas,
pos_depth_cls_preds=None,
pos_weights=None,
pos_cls_scores=None,
with_kpts=False):
bbox_preds: List[Tensor],
pos_dir_cls_preds: List[Tensor],
labels_3d: List[Tensor],
bbox_targets_3d: List[Tensor],
pos_points: Tensor,
pos_inds: Tensor,
batch_img_metas: List[dict],
pos_depth_cls_preds: Optional[Tensor] = None,
pos_weights: Optional[Tensor] = None,
pos_cls_scores: Optional[Tensor] = None,
with_kpts: bool = False) -> Tuple[Tensor]:
"""Decode box predictions and get projected 2D attributes.
Args:
......@@ -448,9 +450,12 @@ class PGDHead(FCOSMono3DHead):
return outputs
def get_pos_predictions(self, bbox_preds, dir_cls_preds, depth_cls_preds,
weights, attr_preds, centernesses, pos_inds,
batch_img_metas):
def get_pos_predictions(self, bbox_preds: List[Tensor],
dir_cls_preds: List[Tensor],
depth_cls_preds: List[Tensor],
weights: List[Tensor], attr_preds: List[Tensor],
centernesses: List[Tensor], pos_inds: Tensor,
batch_img_metas: List[dict]) -> Tuple[Tensor]:
"""Flatten predictions and get positive ones.
Args:
......@@ -528,20 +533,19 @@ class PGDHead(FCOSMono3DHead):
return pos_bbox_preds, pos_dir_cls_preds, pos_depth_cls_preds, \
pos_weights, pos_attr_preds, pos_centerness
@force_fp32(
apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds',
'depth_cls_preds', 'weights', 'attr_preds', 'centernesses'))
def loss(self,
cls_scores,
bbox_preds,
dir_cls_preds,
depth_cls_preds,
weights,
attr_preds,
centernesses,
batch_gt_instances_3d,
batch_img_metas,
batch_gt_instances_ignore=None):
def loss_by_feat(
self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
dir_cls_preds: List[Tensor],
depth_cls_preds: List[Tensor],
weights: List[Tensor],
attr_preds: List[Tensor],
centernesses: List[Tensor],
batch_gt_instances_3d: InstanceList,
batch_gt_instances: InstanceList,
batch_img_metas: List[dict],
batch_gt_instances_ignore: OptInstanceList = None) -> dict:
"""Compute loss of the head.
Args:
......@@ -591,7 +595,7 @@ class PGDHead(FCOSMono3DHead):
bbox_preds[0].device)
labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \
self.get_targets(
all_level_points, batch_gt_instances_3d)
all_level_points, batch_gt_instances_3d, batch_gt_instances)
num_imgs = cls_scores[0].size(0)
# flatten cls_scores and targets
......@@ -785,20 +789,17 @@ class PGDHead(FCOSMono3DHead):
return loss_dict
@force_fp32(
apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds',
'depth_cls_preds', 'weights', 'attr_preds', 'centernesses'))
def get_results(self,
cls_scores,
bbox_preds,
dir_cls_preds,
depth_cls_preds,
weights,
attr_preds,
centernesses,
batch_img_metas,
cfg=None,
rescale=None):
def predict_by_feat(self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
dir_cls_preds: List[Tensor],
depth_cls_preds: List[Tensor],
weights: List[Tensor],
attr_preds: List[Tensor],
centernesses: List[Tensor],
batch_img_metas: Optional[List[dict]] = None,
cfg: OptConfigType = None,
rescale: bool = False) -> InstanceList:
"""Transform network output for a batch into bbox predictions.
Args:
......@@ -824,7 +825,7 @@ class PGDHead(FCOSMono3DHead):
cfg (mmcv.Config, optional): Test / postprocessing configuration,
if None, test_cfg would be used. Defaults to None.
rescale (bool, optional): If True, return boxes in original image
space. Defaults to None.
space. Defaults to False.
Returns:
list[tuple[Tensor]]: Each item in result_list is a tuple, which
......@@ -898,25 +899,33 @@ class PGDHead(FCOSMono3DHead):
centernesses[i][img_id].detach() for i in range(num_levels)
]
img_meta = batch_img_metas[img_id]
results = self._get_results_single(
cls_score_list, bbox_pred_list, dir_cls_pred_list,
depth_cls_pred_list, weight_list, attr_pred_list,
centerness_pred_list, mlvl_points, img_meta, cfg, rescale)
results = self._predict_by_feat_single(
cls_score_list=cls_score_list,
bbox_pred_list=bbox_pred_list,
dir_cls_pred_list=dir_cls_pred_list,
depth_cls_pred_list=depth_cls_pred_list,
weight_list=weight_list,
attr_pred_list=attr_pred_list,
centerness_pred_list=centerness_pred_list,
mlvl_points=mlvl_points,
img_meta=img_meta,
cfg=cfg,
rescale=rescale)
result_list.append(results)
return result_list
def _get_results_single(self,
cls_scores,
bbox_preds,
dir_cls_preds,
depth_cls_preds,
weights,
attr_preds,
centernesses,
mlvl_points,
img_meta,
cfg,
rescale=False):
def _predict_by_feat_single(self,
cls_score_list: List[Tensor],
bbox_pred_list: List[Tensor],
dir_cls_pred_list: List[Tensor],
depth_cls_pred_list: List[Tensor],
weight_list: List[Tensor],
attr_pred_list: List[Tensor],
centerness_pred_list: List[Tensor],
mlvl_points: Tensor,
img_meta: dict,
cfg: ConfigType,
rescale: bool = False) -> InstanceData:
"""Transform outputs for a single batch item into bbox predictions.
Args:
......@@ -951,7 +960,7 @@ class PGDHead(FCOSMono3DHead):
view = np.array(img_meta['cam2img'])
scale_factor = img_meta['scale_factor']
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_points)
mlvl_centers2d = []
mlvl_bboxes = []
mlvl_scores = []
......@@ -966,8 +975,9 @@ class PGDHead(FCOSMono3DHead):
for cls_score, bbox_pred, dir_cls_pred, depth_cls_pred, weight, \
attr_pred, centerness, points in zip(
cls_scores, bbox_preds, dir_cls_preds, depth_cls_preds,
weights, attr_preds, centernesses, mlvl_points):
cls_score_list, bbox_pred_list, dir_cls_pred_list,
depth_cls_pred_list, weight_list, attr_pred_list,
centerness_pred_list, mlvl_points):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
scores = cls_score.permute(1, 2, 0).reshape(
-1, self.cls_out_channels).sigmoid()
......@@ -1018,9 +1028,9 @@ class PGDHead(FCOSMono3DHead):
bbox_pred3d[:, :2] = points - bbox_pred3d[:, :2]
if rescale:
bbox_pred3d[:, :2] /= bbox_pred3d[:, :2].new_tensor(
scale_factor)
scale_factor[0])
if self.pred_bbox2d:
bbox_pred2d /= bbox_pred2d.new_tensor(scale_factor)
bbox_pred2d /= bbox_pred2d.new_tensor(scale_factor[0])
if self.use_depth_classifier:
prob_depth_pred = self.bbox_coder.decode_prob_depth(
depth_cls_pred, self.depth_range, self.depth_unit,
......@@ -1106,13 +1116,21 @@ class PGDHead(FCOSMono3DHead):
results.attr_labels = attrs
if self.pred_bbox2d:
results_2d = InstanceData()
bboxes2d = nms_results[-1]
bboxes2d = torch.cat([bboxes2d, scores[:, None]], dim=1)
results.bboxes = bboxes2d
return results
def get_targets(self, points, batch_gt_instances_3d):
results_2d.bboxes = bboxes2d
results_2d.scores = scores
results_2d.labels = labels
return results, results_2d
else:
return results
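``decode_prob_depth`` used above converts the depth-classifier logits into a continuous depth as a probability-weighted expectation over discretized depth bins. A minimal sketch assuming uniform bin spacing (PGD's coder also supports other discretizations, omitted here; ``decode_prob_depth_uniform`` is a made-up name):

import torch


def decode_prob_depth_uniform(depth_cls_pred, depth_range, num_bins):
    # bin centers for a uniform discretization of [depth_min, depth_max]
    depth_min, depth_max = depth_range
    centers = torch.linspace(depth_min, depth_max, num_bins)
    prob = depth_cls_pred.softmax(dim=-1)  # (N, num_bins)
    return (prob * centers).sum(dim=-1)    # expected depth, shape (N,)


logits = torch.randn(5, 8)  # 5 positive predictions, 8 depth bins
depth = decode_prob_depth_uniform(logits, depth_range=(1.0, 70.0), num_bins=8)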
def get_targets(
self,
points: List[Tensor],
batch_gt_instances_3d: InstanceList,
batch_gt_instances: InstanceList,
) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]:
"""Compute regression, classification and centerss targets for points
in multiple images.
......@@ -1120,9 +1138,10 @@ class PGDHead(FCOSMono3DHead):
points (list[Tensor]): Points of each fpn level, each has shape
(num_points, 2).
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instance_3d. It usually includes ``bboxes``、``labels``
、``bboxes_3d``、``labels_3d``、``depths``、``centers_2d`` and
attributes.
gt_instance_3d. It usually includes ``bboxes_3d``、
``labels_3d``、``depths``、``centers_2d`` and attributes.
batch_gt_instances (list[:obj:`InstanceData`]): Batch of
gt_instance. It usually includes ``bboxes``、``labels``.
Returns:
tuple:
......@@ -1146,14 +1165,17 @@ class PGDHead(FCOSMono3DHead):
if 'attr_labels' not in batch_gt_instances_3d[0]:
for gt_instances_3d in batch_gt_instances_3d:
gt_instances_3d.attr_labels = gt_instances_3d.labels.new_full(
gt_instances_3d.labels.shape, self.attr_background_label)
gt_instances_3d.attr_labels = \
gt_instances_3d.labels_3d.new_full(
gt_instances_3d.labels_3d.shape,
self.attr_background_label)
# get labels and bbox_targets of each image
_, bbox_targets_list, labels_3d_list, bbox_targets_3d_list, \
centerness_targets_list, attr_targets_list = multi_apply(
self._get_target_single,
batch_gt_instances_3d,
batch_gt_instances,
points=concat_points,
regress_ranges=concat_regress_ranges,
num_points_per_lvl=num_points)
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Tuple, Union
from typing import List, Optional, Tuple
import torch
from mmcv.runner import force_fp32
from mmengine.config import ConfigDict
from mmengine.data import InstanceData
from torch import Tensor
from torch.nn import functional as F
from mmdet3d.core.utils import (ConfigType, InstanceList, OptConfigType,
OptInstanceList, OptMultiConfig)
from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet.core import multi_apply
from mmdet.models.utils import gaussian_radius, gen_gaussian_target
......@@ -35,19 +35,20 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
regression heatmap channels.
ori_channel (list[int]): indices of orientation offset pred in
regression heatmap channels.
bbox_coder (dict): Bbox coder for encoding and decoding boxes.
loss_cls (dict, optional): Config of classification loss.
bbox_coder (:obj:`ConfigDict` or dict): Bbox coder for encoding
and decoding boxes.
loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
            Default: loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0).
loss_bbox (dict, optional): Config of localization loss.
loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
Default: loss_bbox=dict(type='L1Loss', loss_weight=10.0).
loss_dir (dict, optional): Config of direction classification loss.
In SMOKE, Default: None.
loss_attr (dict, optional): Config of attribute classification loss.
In SMOKE, Default: None.
loss_centerness (dict): Config of centerness loss.
norm_cfg (dict): Dictionary to construct and config norm layer.
loss_dir (:obj:`ConfigDict` or dict, Optional): Config of direction
classification loss. In SMOKE, Default: None.
loss_attr (:obj:`ConfigDict` or dict, Optional): Config of attribute
classification loss. In SMOKE, Default: None.
        norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and
            config norm layer.
Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
init_cfg (dict): Initialization config dict. Default: None.
init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
dict]): Initialization config dict. Defaults to None.
""" # noqa: E501
def __init__(self,
......@@ -55,15 +56,16 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
in_channels: int,
dim_channel: List[int],
ori_channel: List[int],
bbox_coder: dict,
loss_cls: dict = dict(
                     type='GaussianFocalLoss', loss_weight=1.0),
loss_bbox: dict = dict(type='L1Loss', loss_weight=0.1),
loss_dir: Optional[dict] = None,
loss_attr: Optional[dict] = None,
norm_cfg: dict = dict(
bbox_coder: ConfigType,
loss_cls: ConfigType = dict(
                     type='mmdet.GaussianFocalLoss', loss_weight=1.0),
loss_bbox: ConfigType = dict(
type='mmdet.L1Loss', loss_weight=0.1),
loss_dir: OptConfigType = None,
loss_attr: OptConfigType = None,
norm_cfg: OptConfigType = dict(
type='GN', num_groups=32, requires_grad=True),
init_cfg: Optional[Union[ConfigDict, dict]] = None,
init_cfg: OptMultiConfig = None,
**kwargs) -> None:
super().__init__(
num_classes,
......@@ -79,11 +81,11 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
self.ori_channel = ori_channel
self.bbox_coder = TASK_UTILS.build(bbox_coder)
def forward(self, feats: Tuple[Tensor]):
def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]:
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
x (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
......@@ -95,9 +97,9 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
"""
return multi_apply(self.forward_single, feats)
return multi_apply(self.forward_single, x)
def forward_single(self, x: Tensor) -> Union[Tensor, Tensor]:
def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
"""Forward features of a single scale level.
Args:
......@@ -118,12 +120,11 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
bbox_pred[:, self.ori_channel, ...] = F.normalize(vector_ori)
return cls_score, bbox_pred
@force_fp32(apply_to=('cls_scores', 'bbox_preds'))
def get_results(self,
cls_scores,
bbox_preds,
batch_img_metas,
rescale=None):
def predict_by_feat(self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
batch_img_metas: Optional[List[dict]] = None,
rescale: bool = None) -> InstanceList:
"""Generate bboxes from bbox head predictions.
Args:
......@@ -134,8 +135,16 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
rescale (bool): If True, return boxes in original image space.
Returns:
list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]:
Each item in result_list is 4-tuple.
list[:obj:`InstanceData`]: 3D Detection results of each image
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instance, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (Tensor): Contains a tensor with shape
(num_instances, 7).
"""
assert len(cls_scores) == len(bbox_preds) == 1
cam2imgs = torch.stack([
......@@ -146,7 +155,7 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
cls_scores[0].new_tensor(img_meta['trans_mat'])
for img_meta in batch_img_metas
])
batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap(
batch_bboxes, batch_scores, batch_topk_labels = self._decode_heatmap(
cls_scores[0],
bbox_preds[0],
batch_img_metas,
......@@ -183,14 +192,14 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
return result_list
def decode_heatmap(self,
cls_score,
reg_pred,
batch_img_metas,
cam2imgs,
trans_mats,
topk=100,
kernel=3):
def _decode_heatmap(self,
cls_score: Tensor,
reg_pred: Tensor,
batch_img_metas: List[dict],
cam2imgs: Tensor,
trans_mats: Tensor,
topk: int = 100,
kernel: int = 3) -> Tuple[Tensor, Tensor, Tensor]:
"""Transform outputs into detections raw bbox predictions.
Args:
......@@ -212,6 +221,7 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
Returns:
tuple[torch.Tensor]: Decoded output of SMOKEHead, containing
the following Tensors:
- batch_bboxes (Tensor): Coords of each 3D box.
shape (B, k, 7)
- batch_scores (Tensor): Scores of each 3D box.
......@@ -241,9 +251,10 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
batch_bboxes = batch_bboxes.view(bs, -1, self.bbox_code_size)
return batch_bboxes, batch_scores, batch_topk_labels
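The decode above hinges on the standard max-pool trick: a ``kernel``-sized max-pool acts as NMS that keeps only local maxima on the class heatmap, and a global top-k then picks candidate centers. A pure-PyTorch sketch of that step (a simplified stand-in for the mmdet gaussian-target helpers used internally):

import torch
import torch.nn.functional as F


def topk_from_heatmap(heatmap, k=100, kernel=3):
    # suppress non-maxima: keep only scores equal to their local maximum
    pad = (kernel - 1) // 2
    hmax = F.max_pool2d(heatmap, kernel, stride=1, padding=pad)
    heatmap = heatmap * (hmax == heatmap)
    bs, num_classes, h, w = heatmap.shape
    scores, inds = heatmap.view(bs, -1).topk(k)
    labels = inds // (h * w)          # class channel of each peak
    pixel = inds % (h * w)
    ys, xs = pixel // w, pixel % w    # peak coordinates on the feature map
    return scores, labels, xs, ys


scores, labels, xs, ys = topk_from_heatmap(torch.rand(2, 3, 96, 312))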
def get_predictions(self, labels_3d, centers_2d, gt_locations,
gt_dimensions, gt_orientations, indices,
batch_img_metas, pred_reg):
def get_predictions(self, labels_3d: Tensor, centers_2d: Tensor,
gt_locations: Tensor, gt_dimensions: Tensor,
gt_orientations: Tensor, indices: Tensor,
batch_img_metas: List[dict], pred_reg: Tensor) -> dict:
"""Prepare predictions for computing loss.
Args:
......@@ -266,6 +277,7 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
Returns:
dict: the dict has components below:
- bbox3d_yaws (:obj:`CameraInstance3DBoxes`):
bbox calculated using pred orientations.
- bbox3d_dims (:obj:`CameraInstance3DBoxes`):
......@@ -312,22 +324,26 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
return pred_bboxes
def get_targets(self, batch_gt_instances_3d, feat_shape, batch_img_metas):
def get_targets(self, batch_gt_instances_3d: InstanceList,
batch_gt_instances: InstanceList, feat_shape: Tuple[int],
batch_img_metas: List[dict]) -> Tuple[Tensor, int, dict]:
"""Get training targets for batch images.
Args:
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instance_3d. It usually includes ``bboxes``、``labels``
、``bboxes_3d``、``labels_3d``、``depths``、``centers_2d`` and
attributes.
gt_instance_3d. It usually includes ``bboxes_3d``、
``labels_3d``、``depths``、``centers_2d`` and attributes.
batch_gt_instances (list[:obj:`InstanceData`]): Batch of
gt_instance. It usually includes ``bboxes``、``labels``.
feat_shape (tuple[int]): Feature map shape with value,
shape (B, _, H, W).
batch_img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
tuple[Tensor, dict]: The Tensor value is the targets of
tuple[Tensor, int, dict]: The Tensor value is the targets of
center heatmap, the dict has components below:
- gt_centers_2d (Tensor): Coords of each projected 3D box
center on image. shape (B * max_objs, 2)
- gt_labels_3d (Tensor): Labels of each 3D box.
......@@ -347,10 +363,10 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
"""
gt_bboxes = [
gt_instances_3d.bboxes for gt_instances_3d in batch_gt_instances_3d
gt_instances.bboxes for gt_instances in batch_gt_instances
]
gt_labels = [
gt_instances_3d.labels for gt_instances_3d in batch_gt_instances_3d
gt_instances.labels for gt_instances in batch_gt_instances
]
gt_bboxes_3d = [
gt_instances_3d.bboxes_3d
......@@ -459,12 +475,14 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
return center_heatmap_target, avg_factor, target_labels
def loss(self,
cls_scores,
bbox_preds,
batch_gt_instances_3d,
batch_img_metas,
batch_gt_instances_ignore=None):
def loss_by_feat(
self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
batch_gt_instances_3d: InstanceList,
batch_gt_instances: InstanceList,
batch_img_metas: List[dict],
batch_gt_instances_ignore: OptInstanceList = None) -> dict:
"""Compute loss of the head.
Args:
......@@ -474,9 +492,10 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
number is bbox_code_size.
shape (B, 7, H, W).
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instance_3d. It usually includes ``bboxes``、``labels``
、``bboxes_3d``、``labels_3d``、``depths``、``centers_2d`` and
attributes.
gt_instance_3d. It usually includes ``bboxes_3d``、
``labels_3d``、``depths``、``centers_2d`` and attributes.
batch_gt_instances (list[:obj:`InstanceData`]): Batch of
gt_instance. It usually includes ``bboxes``、``labels``.
batch_img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
......@@ -485,15 +504,19 @@ class SMOKEMono3DHead(AnchorFreeMono3DHead):
Defaults to None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
dict[str, Tensor]: A dictionary of loss components, which has
components below:
- loss_cls (Tensor): loss of cls heatmap.
- loss_bbox (Tensor): loss of bbox heatmap.
"""
assert len(cls_scores) == len(bbox_preds) == 1
assert batch_gt_instances_ignore is None
center_2d_heatmap = cls_scores[0]
pred_reg = bbox_preds[0]
center_2d_heatmap_target, avg_factor, target_labels = \
self.get_targets(batch_gt_instances_3d,
batch_gt_instances,
center_2d_heatmap.shape,
batch_img_metas)
......
# Copyright (c) OpenMMLab. All rights reserved.
from mmdet3d.core import ConfigType, OptConfigType, OptMultiConfig
from mmdet3d.registry import MODELS
from .single_stage_mono3d import SingleStageMono3DDetector
......@@ -9,14 +10,36 @@ class FCOSMono3D(SingleStageMono3DDetector):
Currently please refer to our entry on the
`leaderboard <https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Camera>`_.
Args:
backbone (:obj:`ConfigDict` or dict): The backbone config.
neck (:obj:`ConfigDict` or dict): The neck config.
bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
train_cfg (:obj:`ConfigDict` or dict, optional): The training config
of FCOS. Defaults to None.
test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
of FCOS. Defaults to None.
data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
:class:`DetDataPreprocessor` to process the input data.
Defaults to None.
init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
list[dict], optional): Initialization config dict.
Defaults to None.
""" # noqa: E501
def __init__(self,
backbone,
neck,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(FCOSMono3D, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)
backbone: ConfigType,
neck: ConfigType,
bbox_head: ConfigType,
train_cfg: OptConfigType = None,
test_cfg: OptConfigType = None,
data_preprocessor: OptConfigType = None,
init_cfg: OptMultiConfig = None) -> None:
super().__init__(
backbone=backbone,
neck=neck,
bbox_head=bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
data_preprocessor=data_preprocessor,
init_cfg=init_cfg)
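With the refactor, detectors are built from config dicts through the registry. A schematic example (the component settings below are placeholders for illustration, not the shipped FCOS3D config):

from mmdet3d.registry import MODELS

model_cfg = dict(
    type='FCOSMono3D',
    data_preprocessor=dict(type='Det3DDataPreprocessor'),
    backbone=dict(
        type='mmdet.ResNet', depth=50, num_stages=4,
        out_indices=(0, 1, 2, 3)),
    neck=dict(
        type='mmdet.FPN', in_channels=[256, 512, 1024, 2048],
        out_channels=256, num_outs=5),
    bbox_head=dict(
        type='FCOSMono3DHead', num_classes=10, in_channels=256),
    test_cfg=dict(nms_pre=1000, score_thr=0.05, max_per_img=200))

model = MODELS.build(model_cfg)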
# Copyright (c) OpenMMLab. All rights reserved.
from os import path as osp
from typing import Tuple
import mmcv
import numpy as np
import torch
from mmcv.parallel import DataContainer as DC
from torch import Tensor
from mmdet3d.core import (CameraInstance3DBoxes, bbox3d2result,
show_multi_modality_result)
from mmdet3d.core import Det3DDataSample, InstanceList
from mmdet3d.core.utils import SampleList
from mmdet3d.registry import MODELS
from mmdet.models.detectors.single_stage import SingleStageDetector
......@@ -16,212 +13,61 @@ from mmdet.models.detectors.single_stage import SingleStageDetector
class SingleStageMono3DDetector(SingleStageDetector):
"""Base class for monocular 3D single-stage detectors.
Single-stage detectors directly and densely predict bounding boxes on the
output features of the backbone+neck.
Monocular 3D single-stage detectors directly and densely predict bounding
boxes on the output features of the backbone+neck.
"""
    def extract_feats(self, imgs):
        """Directly extract features from the backbone+neck."""
        assert isinstance(imgs, list)
        return [self.extract_feat(img) for img in imgs]

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_3d,
                      gt_labels_3d,
                      centers2d,
                      depths,
                      attr_labels=None,
                      gt_bboxes_ignore=None):
        """
        Args:
            img (Tensor): Input images of shape (N, C, H, W).
                Typically these should be mean centered and std scaled.
            img_metas (list[dict]): A List of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                :class:`mmdet.datasets.pipelines.Collect`.
            gt_bboxes (list[Tensor]): Each item are the truth boxes for each
                image in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): Class indices corresponding to each box.
            gt_bboxes_3d (list[Tensor]): Each item are the 3D truth boxes for
                each image in [x, y, z, x_size, y_size, z_size, yaw, vx, vy]
                format.
            gt_labels_3d (list[Tensor]): 3D class indices corresponding to
                each box.
            centers2d (list[Tensor]): Projected 3D centers onto 2D images.
            depths (list[Tensor]): Depth of projected centers on 2D images.
            attr_labels (list[Tensor], optional): Attribute indices
                corresponding to each box.
            gt_bboxes_ignore (list[Tensor]): Specify which bounding
                boxes can be ignored when computing the loss.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        x = self.extract_feat(img)
        losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes,
                                              gt_labels, gt_bboxes_3d,
                                              gt_labels_3d, centers2d, depths,
                                              attr_labels, gt_bboxes_ignore)
        return losses

    def simple_test(self, img, img_metas, rescale=False):
        """Test function without test time augmentation.

        Args:
            imgs (list[torch.Tensor]): List of multiple images.
            img_metas (list[dict]): List of image information.
            rescale (bool, optional): Whether to rescale the results.
                Defaults to False.

        Returns:
            list[list[np.ndarray]]: BBox results of each image and classes.
                The outer list corresponds to each image. The inner list
                corresponds to each class.
        """
        x = self.extract_feat(img)
        outs = self.bbox_head(x)
        bbox_outputs = self.bbox_head.get_bboxes(
            *outs, img_metas, rescale=rescale)

        if self.bbox_head.pred_bbox2d:
            from mmdet.core import bbox2result
            bbox2d_img = [
                bbox2result(bboxes2d, labels, self.bbox_head.num_classes)
                for bboxes, scores, labels, attrs, bboxes2d in bbox_outputs
            ]
            bbox_outputs = [bbox_outputs[0][:-1]]

        bbox_img = [
            bbox3d2result(bboxes, scores, labels, attrs)
            for bboxes, scores, labels, attrs in bbox_outputs
        ]

        bbox_list = [dict() for i in range(len(img_metas))]
        for result_dict, img_bbox in zip(bbox_list, bbox_img):
            result_dict['img_bbox'] = img_bbox
        if self.bbox_head.pred_bbox2d:
            for result_dict, img_bbox2d in zip(bbox_list, bbox2d_img):
                result_dict['img_bbox2d'] = img_bbox2d
        return bbox_list

    def aug_test(self, imgs, img_metas, rescale=False):
        """Test function with test time augmentation."""
        feats = self.extract_feats(imgs)

        # only support aug_test for one sample
        outs_list = [self.bbox_head(x) for x in feats]
        for i, img_meta in enumerate(img_metas):
            if img_meta[0]['pcd_horizontal_flip']:
                for j in range(len(outs_list[i])):  # for each prediction
                    if outs_list[i][j][0] is None:
                        continue
                    for k in range(len(outs_list[i][j])):
                        # every stride of featmap
                        outs_list[i][j][k] = torch.flip(
                            outs_list[i][j][k], dims=[3])
                reg = outs_list[i][1]
                for reg_feat in reg:
                    # offset_x
                    reg_feat[:, 0, :, :] = 1 - reg_feat[:, 0, :, :]
                    # velo_x
                    if self.bbox_head.pred_velo:
                        reg_feat[:, 7, :, :] = -reg_feat[:, 7, :, :]
                    # rotation
                    reg_feat[:, 6, :, :] = -reg_feat[:, 6, :, :] + np.pi

        merged_outs = []
        for i in range(len(outs_list[0])):  # for each prediction
            merged_feats = []
            for j in range(len(outs_list[0][i])):
                if outs_list[0][i][0] is None:
                    merged_feats.append(None)
                    continue
                # for each stride of featmap
                avg_feats = torch.mean(
                    torch.cat([x[i][j] for x in outs_list]),
                    dim=0,
                    keepdim=True)
                if i == 1:  # regression predictions
                    # rot/velo/2d det keeps the original
                    avg_feats[:, 6:, :, :] = \
                        outs_list[0][i][j][:, 6:, :, :]
                if i == 2:
                    # dir_cls keeps the original
                    avg_feats = outs_list[0][i][j]
                merged_feats.append(avg_feats)
            merged_outs.append(merged_feats)
        merged_outs = tuple(merged_outs)

        bbox_outputs = self.bbox_head.get_bboxes(
            *merged_outs, img_metas[0], rescale=rescale)
        if self.bbox_head.pred_bbox2d:
            from mmdet.core import bbox2result
            bbox2d_img = [
                bbox2result(bboxes2d, labels, self.bbox_head.num_classes)
                for bboxes, scores, labels, attrs, bboxes2d in bbox_outputs
            ]
            bbox_outputs = [bbox_outputs[0][:-1]]

        bbox_img = [
            bbox3d2result(bboxes, scores, labels, attrs)
            for bboxes, scores, labels, attrs in bbox_outputs
        ]

        bbox_list = dict()
        bbox_list.update(img_bbox=bbox_img[0])
        if self.bbox_head.pred_bbox2d:
            bbox_list.update(img_bbox2d=bbox2d_img[0])

        return [bbox_list]

    def show_results(self, data, result, out_dir, show=False, score_thr=None):
        """Results visualization.

        Args:
            data (list[dict]): Input images and the information of the sample.
            result (list[dict]): Prediction results.
            out_dir (str): Output directory of visualization result.
            show (bool, optional): Determines whether you are
                going to show result by open3d.
                Defaults to False.
            TODO: implement score_thr of single_stage_mono3d.
            score_thr (float, optional): Score threshold of bounding boxes.
                Default to None.
                Not implemented yet, but it is here for unification.
        """
        for batch_id in range(len(result)):
            if isinstance(data['img_metas'][0], DC):
                img_filename = data['img_metas'][0]._data[0][batch_id][
                    'filename']
                cam2img = data['img_metas'][0]._data[0][batch_id]['cam2img']
            elif mmcv.is_list_of(data['img_metas'][0], dict):
                img_filename = data['img_metas'][0][batch_id]['filename']
                cam2img = data['img_metas'][0][batch_id]['cam2img']
            else:
                raise ValueError(
                    f"Unsupported data type {type(data['img_metas'][0])} "
                    f'for visualization!')
            img = mmcv.imread(img_filename)
            file_name = osp.split(img_filename)[-1].split('.')[0]

            assert out_dir is not None, 'Expect out_dir, got none.'

            pred_bboxes = result[batch_id]['img_bbox']['boxes_3d']
            assert isinstance(pred_bboxes, CameraInstance3DBoxes), \
                f'unsupported predicted bbox type {type(pred_bboxes)}'

            show_multi_modality_result(
                img,
                None,
                pred_bboxes,
                cam2img,
                out_dir,
                file_name,
                'camera',
                show=show)

    def convert_to_datasample(self, results_list: InstanceList) -> SampleList:
        """Convert results list to `Det3DDataSample`.

        Args:
            results_list (list[:obj:`InstanceData`]): Detection results
                of each image. For each image, it could contain two result
                formats:

                1. pred_instances_3d
                2. (pred_instances_3d, pred_instances)

        Returns:
            list[:obj:`Det3DDataSample`]: 3D detection results of the
            input images. Each Det3DDataSample usually contains
            'pred_instances_3d', and ``pred_instances_3d`` usually
            contains the following keys.

            - scores_3d (Tensor): Classification scores, has a shape
              (num_instance, )
            - labels_3d (Tensor): Labels of bboxes, has a shape
              (num_instances, ).
            - bboxes_3d (Tensor): Contains a tensor with shape
              (num_instances, C) where C >= 7.
        """
        out_results_list = []
        for i in range(len(results_list)):
            result = Det3DDataSample()
            if len(results_list[i]) == 2:
                result.pred_instances_3d = results_list[i][0]
                result.pred_instances = results_list[i][1]
            else:
                result.pred_instances_3d = results_list[i]
            out_results_list.append(result)
        return out_results_list

    def extract_feat(self, batch_inputs_dict: dict) -> Tuple[Tensor]:
        """Extract features.

        Args:
            batch_inputs_dict (dict): Contains an 'imgs' key
                with an image tensor of shape (N, C, H, W).

        Returns:
            tuple[Tensor]: Multi-level features that may have
            different resolutions.
        """
        batch_imgs = batch_inputs_dict['imgs']
        x = self.backbone(batch_imgs)
        if self.with_neck:
            x = self.neck(x)
        return x

    # TODO: Support test time augmentation
    def aug_test(self, imgs, img_metas, rescale=False):
        """Test function with test time augmentation."""
        pass
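The refactored single-stage inference path then reads, end to end (shapes and keys shown are illustrative):

# batch_inputs_dict = {'imgs': Tensor of shape (N, 3, H, W)}
# feats = detector.extract_feat(batch_inputs_dict)    # tuple of FPN levels
# results_list = detector.bbox_head.predict(feats, batch_data_samples)
# data_samples = detector.convert_to_datasample(results_list)
# data_samples[0].pred_instances_3d.bboxes_3d         # (num_instances, >=7)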
# Copyright (c) OpenMMLab. All rights reserved.
from mmdet3d.core import ConfigType, OptConfigType, OptMultiConfig
from mmdet3d.registry import MODELS
from .single_stage_mono3d import SingleStageMono3DDetector
......@@ -8,14 +9,35 @@ class SMOKEMono3D(SingleStageMono3DDetector):
r"""SMOKE <https://arxiv.org/abs/2002.10111>`_ for monocular 3D object
detection.
Args:
backbone (:obj:`ConfigDict` or dict): The backbone config.
neck (:obj:`ConfigDict` or dict): The neck config.
bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
            of SMOKE. Defaults to None.
        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
            of SMOKE. Defaults to None.
data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
:class:`DetDataPreprocessor` to process the input data.
Defaults to None.
init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
list[dict], optional): Initialization config dict.
Defaults to None.
"""
def __init__(self,
backbone,
neck,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(SMOKEMono3D, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)
backbone: ConfigType,
neck: ConfigType,
bbox_head: ConfigType,
train_cfg: OptConfigType = None,
test_cfg: OptConfigType = None,
data_preprocessor: OptConfigType = None,
init_cfg: OptMultiConfig = None) -> None:
super().__init__(
backbone=backbone,
neck=neck,
bbox_head=bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
data_preprocessor=data_preprocessor,
init_cfg=init_cfg)
......@@ -85,6 +85,10 @@ def test_getitem():
assert isinstance(ann_info['gt_bboxes_3d'], LiDARInstance3DBoxes)
assert torch.allclose(ann_info['gt_bboxes_3d'].tensor.sum(),
torch.tensor(7.2650))
assert 'centers_2d' in ann_info
assert ann_info['centers_2d'].dtype == np.float64
assert 'depths' in ann_info
assert ann_info['depths'].dtype == np.float64
assert 'group_id' in ann_info
assert ann_info['group_id'].dtype == np.int64
assert 'occluded' in ann_info
......
......@@ -45,8 +45,8 @@ def test_getitem():
_generate_nus_dataset_config()
nus_dataset = NuScenesDataset(
data_root,
ann_file,
data_root=data_root,
ann_file=ann_file,
data_prefix=data_prefix,
pipeline=pipeline,
metainfo=dict(CLASSES=classes),
......
......@@ -4,6 +4,7 @@ import numpy as np
from mmdet3d.core import LiDARInstance3DBoxes
# create a dummy `results` to test the pipeline
from mmdet3d.datasets import LoadAnnotations3D, LoadPointsFromFile
from mmdet3d.datasets.pipelines.loading import LoadImageFromFileMono3D
def create_dummy_data_info(with_ann=True):
......@@ -20,6 +21,10 @@ def create_dummy_data_info(with_ann=True):
-1.5808]])),
'gt_labels_3d':
np.array([1]),
'centers_2d':
np.array([[765.04, 214.56]]),
'depths':
np.array([8.410]),
'num_lidar_pts':
np.array([377]),
'difficulty':
......@@ -134,6 +139,9 @@ def create_dummy_data_info(with_ann=True):
],
'bbox_label_3d':
-1,
'center_2d': [765.04, 214.56],
'depth':
8.410,
'num_lidar_pts':
377,
'difficulty':
......@@ -168,3 +176,17 @@ def create_data_info_after_loading():
data_info = load_points_transform(data_info)
data_info_after_loading = load_anns_transform(data_info)
return data_info_after_loading
def create_mono3d_data_info_after_loading():
load_anns_transform = LoadAnnotations3D(
with_bbox=True,
with_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True)
load_img_transform = LoadImageFromFileMono3D()
data_info = create_dummy_data_info()
data_info = load_img_transform(data_info)
data_info_after_loading = load_anns_transform(data_info)
return data_info_after_loading
......@@ -117,6 +117,7 @@ class TestFCOSMono3DHead(TestCase):
# When truth is non-empty then all losses
# should be nonzero for random inputs
gt_instances_3d = InstanceData()
gt_instances = InstanceData()
gt_bboxes = torch.rand([3, 4], dtype=torch.float32)
gt_bboxes_3d = CameraInstance3DBoxes(torch.rand([3, 9]), box_dim=9)
......@@ -129,14 +130,14 @@ class TestFCOSMono3DHead(TestCase):
gt_instances_3d.bboxes_3d = gt_bboxes_3d
gt_instances_3d.labels_3d = gt_labels_3d
gt_instances_3d.bboxes = gt_bboxes
gt_instances_3d.labels = gt_labels
gt_instances.bboxes = gt_bboxes
gt_instances.labels = gt_labels
gt_instances_3d.centers_2d = centers_2d
gt_instances_3d.depths = depths
gt_instances_3d.attr_labels = attr_labels
gt_losses = fcos_mono3d_head.loss(*ret_dict, [gt_instances_3d],
img_metas)
gt_losses = fcos_mono3d_head.loss_by_feat(*ret_dict, [gt_instances_3d],
[gt_instances], img_metas)
gt_cls_loss = gt_losses['loss_cls'].item()
gt_siz_loss = gt_losses['loss_size'].item()
......@@ -160,7 +161,7 @@ class TestFCOSMono3DHead(TestCase):
        self.assertGreater(gt_atr_loss, 0, 'attribute loss should be positive')
# test get_results
results_list = fcos_mono3d_head.get_results(*ret_dict, img_metas)
results_list = fcos_mono3d_head.predict_by_feat(*ret_dict, img_metas)
self.assertEqual(
len(results_list), 1,
            'there should be results for exactly one image')
......
......@@ -142,6 +142,7 @@ class TestFGDHead(TestCase):
# When truth is non-empty then all losses
# should be nonzero for random inputs
gt_instances_3d = InstanceData()
gt_instances = InstanceData()
gt_bboxes = torch.rand([3, 4], dtype=torch.float32)
gt_bboxes_3d = CameraInstance3DBoxes(torch.rand([3, 7]), box_dim=7)
......@@ -152,12 +153,13 @@ class TestFGDHead(TestCase):
gt_instances_3d.bboxes_3d = gt_bboxes_3d
gt_instances_3d.labels_3d = gt_labels_3d
gt_instances_3d.bboxes = gt_bboxes
gt_instances_3d.labels = gt_labels
gt_instances.bboxes = gt_bboxes
gt_instances.labels = gt_labels
gt_instances_3d.centers_2d = centers_2d
gt_instances_3d.depths = depths
gt_losses = pgd_head.loss(*ret_dict, [gt_instances_3d], img_metas)
gt_losses = pgd_head.loss_by_feat(*ret_dict, [gt_instances_3d],
[gt_instances], img_metas)
gt_cls_loss = gt_losses['loss_cls'].item()
gt_siz_loss = gt_losses['loss_size'].item()
......@@ -184,15 +186,15 @@ class TestFGDHead(TestCase):
'consistency loss should be positive')
# test get_results
results_list = pgd_head.get_results(*ret_dict, img_metas)
results_list = pgd_head.predict_by_feat(*ret_dict, img_metas)
self.assertEqual(
len(results_list), 1,
            'there should be results for exactly one image')
results = results_list[0]
results, results_2d = results_list[0]
pred_bboxes_3d = results.bboxes_3d
pred_scores_3d = results.scores_3d
pred_labels_3d = results.labels_3d
pred_bboxes_2d = results.bboxes
pred_bboxes_2d = results_2d.bboxes
self.assertEqual(pred_bboxes_3d.tensor.shape, torch.Size([20, 7]),
'the shape of predicted 3d bboxes should be [20, 7]')
self.assertEqual(
......@@ -202,6 +204,6 @@ class TestFGDHead(TestCase):
pred_labels_3d.shape, torch.Size([20]),
'the shape of predicted 3d bbox labels should be [20]')
self.assertEqual(
pred_bboxes_2d.shape, torch.Size([20, 5]),
'the shape of predicted 2d bbox attribute labels should be [20, 5]'
            pred_bboxes_2d.shape, torch.Size([20, 4]),
            'the shape of predicted 2d bboxes should be [20, 4]'
)
......@@ -82,6 +82,7 @@ class TestSMOKEMono3DHead(TestCase):
# When truth is non-empty then all losses
# should be nonzero for random inputs
gt_instances_3d = InstanceData()
gt_instances = InstanceData()
gt_bboxes = torch.Tensor([[1.0, 2.0, 20.0, 40.0],
[45.0, 50.0, 80.0, 70.1],
......@@ -94,13 +95,14 @@ class TestSMOKEMono3DHead(TestCase):
gt_instances_3d.bboxes_3d = gt_bboxes_3d
gt_instances_3d.labels_3d = gt_labels_3d
gt_instances_3d.bboxes = gt_bboxes
gt_instances_3d.labels = gt_labels
gt_instances.bboxes = gt_bboxes
gt_instances.labels = gt_labels
gt_instances_3d.centers_2d = centers_2d
gt_instances_3d.depths = depths
gt_losses = smoke_mono3d_head.loss(*ret_dict, [gt_instances_3d],
img_metas)
gt_losses = smoke_mono3d_head.loss_by_feat(*ret_dict,
[gt_instances_3d],
[gt_instances], img_metas)
gt_cls_loss = gt_losses['loss_cls'].item()
gt_box_loss = gt_losses['loss_bbox'].item()
......@@ -109,7 +111,7 @@ class TestSMOKEMono3DHead(TestCase):
self.assertGreater(gt_box_loss, 0, 'bbox loss should be positive')
# test get_results
results_list = smoke_mono3d_head.get_results(*ret_dict, img_metas)
results_list = smoke_mono3d_head.predict_by_feat(*ret_dict, img_metas)
self.assertEqual(
len(results_list), 1, 'there should be one image results')
results = results_list[0]
......
......@@ -14,7 +14,10 @@ from os import path as osp
import mmcv
import numpy as np
from nuscenes.nuscenes import NuScenes
from mmdet3d.core.bbox import points_cam2img
from mmdet3d.datasets.convert_utils import get_2d_boxes
from mmdet3d.datasets.utils import convert_quaternion_to_matrix
......@@ -60,6 +63,19 @@ def get_empty_instance():
return instance
def get_empty_multicamera_instances():
cam_instance = dict(
        CAM_FRONT=None,
CAM_FRONT_RIGHT=None,
CAM_FRONT_LEFT=None,
CAM_BACK=None,
CAM_BACK_RIGHT=None,
CAM_BACK_LEFT=None)
return cam_instance
def get_empty_lidar_points():
lidar_points = dict(
# (int, optional) : Number of features for each point.
......@@ -206,6 +222,32 @@ def clear_data_info_unused_keys(data_info):
return data_info, empty_flag
def generate_camera_instances(info, nusc):
# get bbox annotations for camera
camera_types = [
'CAM_FRONT',
'CAM_FRONT_RIGHT',
'CAM_FRONT_LEFT',
'CAM_BACK',
'CAM_BACK_LEFT',
'CAM_BACK_RIGHT',
]
empty_multicamera_instance = get_empty_multicamera_instances()
for cam in camera_types:
cam_info = info['cams'][cam]
# list[dict]
ann_infos = get_2d_boxes(
nusc,
cam_info['sample_data_token'],
visibilities=['', '1', '2', '3', '4'])
empty_multicamera_instance[cam] = ann_infos
return empty_multicamera_instance
def update_nuscenes_infos(pkl_path, out_dir):
print(f'{pkl_path} will be modified.')
if out_dir in pkl_path:
......@@ -222,6 +264,11 @@ def update_nuscenes_infos(pkl_path, out_dir):
'version':
data_list['metadata']['version']
}
nusc = NuScenes(
version=data_list['metadata']['version'],
dataroot='./data/nuscenes',
verbose=True)
print('Start updating:')
converted_list = []
for i, ori_info_dict in enumerate(
......@@ -304,6 +351,8 @@ def update_nuscenes_infos(pkl_path, out_dir):
empty_instance['bbox_3d_isvalid'] = ori_info_dict['valid_flag'][i]
empty_instance = clear_instance_unused_keys(empty_instance)
temp_data_info['instances'].append(empty_instance)
temp_data_info['cam_instances'] = generate_camera_instances(
ori_info_dict, nusc)
temp_data_info, _ = clear_data_info_unused_keys(temp_data_info)
converted_list.append(temp_data_info)
pkl_name = pkl_path.split('/')[-1]
......@@ -313,7 +362,6 @@ def update_nuscenes_infos(pkl_path, out_dir):
converted_data_info = dict(metainfo=METAINFO, data_list=converted_list)
mmcv.dump(converted_data_info, out_path, 'pkl')
return temp_lidar_sweep
def update_kitti_infos(pkl_path, out_dir):
......@@ -382,6 +430,7 @@ def update_kitti_infos(pkl_path, out_dir):
anns = ori_info_dict['annos']
num_instances = len(anns['name'])
cam2img = ori_info_dict['calib']['P2']
ignore_class_name = set()
instance_list = []
......@@ -401,6 +450,17 @@ def update_kitti_infos(pkl_path, out_dir):
loc = anns['location'][instance_id]
dims = anns['dimensions'][instance_id]
rots = anns['rotation_y'][:, None][instance_id]
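        # shift the box origin from the KITTI camera convention
        # (0.5, 1.0, 0.5), i.e. the bottom center, to the gravity center
        # (0.5, 0.5, 0.5) before projecting it onto the image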
dst = np.array([0.5, 0.5, 0.5])
src = np.array([0.5, 1.0, 0.5])
center_3d = loc + dims * (dst - src)
center_2d = points_cam2img(
center_3d.reshape([1, 3]), cam2img, with_depth=True)
center_2d = center_2d.squeeze().tolist()
empty_instance['center_2d'] = center_2d[:2]
empty_instance['depth'] = center_2d[2]
gt_bboxes_3d = np.concatenate([loc, dims, rots]).tolist()
empty_instance['bbox_3d'] = gt_bboxes_3d
empty_instance['bbox_label_3d'] = copy.deepcopy(
......@@ -734,7 +794,6 @@ def parse_args():
type=str,
        default='./data/kitti/kitti_infos_train.pkl',
help='specify the root dir of dataset')
parser.add_argument(
'--out-dir',
type=str,
......