Release v1.1.0rc1

Release v1.1.0rc1

Release v1.1.0rc1
6c03a971 · Tai-Wang · GitHub · 9611c2d0 · ca42c312 · 6c03a971
Unverified Commit 6c03a971 authored Oct 14, 2022 by Tai-Wang Committed by GitHub Oct 14, 2022
20 changed files
--- a/mmdet3d/models/detectors/multiview_dfm.py
+++ b/mmdet3d/models/detectors/multiview_dfm.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from mmdet3d.models.layers.fusion_layers.point_fusion import (point_sample,
+                                                              voxel_sample)
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures.bbox_3d.utils import get_lidar2img
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils import ConfigType, OptConfigType
+from mmdet.models.detectors import BaseDetector
+from .dfm import DfM
+from .imvoxelnet import ImVoxelNet
+@MODELS.register_module()
+class MultiViewDfM(ImVoxelNet, DfM):
+    r"""Waymo challenge solution of `MV-FCOS3D++
+    <https://arxiv.org/abs/2207.12716>`_.
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        backbone_stereo (:obj:`ConfigDict` or dict): The stereo backbone
+            config. The stereo backbone is only built when this is not None.
+        backbone_3d (:obj:`ConfigDict` or dict): The 3d backbone config.
+            The 3D backbone is only built when this is not None.
+        neck_3d (:obj:`ConfigDict` or dict): The 3D neck config. The 3D neck
+            is only built when this is not None.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
+            ``train_cfg`` and ``test_cfg`` are written into it in place.
+        voxel_size (:obj:`ConfigDict` or dict): The voxel size. It is indexed
+            with 0, 1 and 2 to get the size along each of the three axes.
+        anchor_generator (:obj:`ConfigDict` or dict): The anchor generator
+            config. Its first entry of ``ranges`` is used as the voxel range.
+        neck_2d (:obj:`ConfigDict` or dict, optional): The 2D neck config
+            for 2D object detection. Defaults to None.
+        bbox_head_2d (:obj:`ConfigDict` or dict, optional): The 2D bbox
+            head config for 2D object detection. Defaults to None.
+        depth_head_2d (:obj:`ConfigDict` or dict, optional): The 2D depth
+            head config for depth estimation in fov space. Defaults to None.
+        depth_head (:obj:`ConfigDict` or dict, optional): The depth head
+            config for depth estimation in 3D voxel projected to fov space.
+            Defaults to None.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of
+            training hyper-parameters. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test
+            hyper-parameters. Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`BaseDataPreprocessor`. It usually includes
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+            Defaults to None.
+        valid_sample (bool): Whether to filter invalid points in view
+            transformation. Defaults to True.
+        temporal_aggregate (str): Key to determine the aggregation way in
+            temporal fusion. ``'mean'`` and ``'concat'`` are handled when
+            ``valid_sample`` is True. Defaults to 'concat'.
+        transform_depth (bool): Key to determine the transformation of depth.
+            Defaults to True.
+        init_cfg (:obj:`ConfigDict` or dict, optional): The initialization
+            config. Defaults to None.
+    """
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 backbone_stereo: ConfigType,
+                 backbone_3d: ConfigType,
+                 neck_3d: ConfigType,
+                 bbox_head: ConfigType,
+                 voxel_size: ConfigType,
+                 anchor_generator: ConfigType,
+                 neck_2d: OptConfigType = None,
+                 bbox_head_2d: OptConfigType = None,
+                 depth_head_2d: OptConfigType = None,
+                 depth_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 valid_sample: bool = True,
+                 temporal_aggregate: str = 'concat',
+                 transform_depth: bool = True,
+                 init_cfg: OptConfigType = None):
+        # TODO merge with DFM
+        # The parent constructors are bypassed on purpose: only the
+        # BaseDetector initialisation is run and every sub-module is built
+        # explicitly below.
+        BaseDetector.__init__(
+            self, data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.backbone = MODELS.build(backbone)
+        self.neck = MODELS.build(neck)
+        if backbone_stereo is not None:
+            # keep the stereo backbone consistent with the 2D neck outputs
+            backbone_stereo.update(cat_img_feature=self.neck.cat_img_feature)
+            backbone_stereo.update(in_sem_channels=self.neck.sem_channels[-1])
+            self.backbone_stereo = MODELS.build(backbone_stereo)
+            assert self.neck.cat_img_feature == \
+                self.backbone_stereo.cat_img_feature
+            assert self.neck.sem_channels[
+                -1] == self.backbone_stereo.in_sem_channels
+        if backbone_3d is not None:
+            self.backbone_3d = MODELS.build(backbone_3d)
+        if neck_3d is not None:
+            self.neck_3d = MODELS.build(neck_3d)
+        if neck_2d is not None:
+            self.neck_2d = MODELS.build(neck_2d)
+        if bbox_head_2d is not None:
+            self.bbox_head_2d = MODELS.build(bbox_head_2d)
+        if depth_head_2d is not None:
+            self.depth_head_2d = MODELS.build(depth_head_2d)
+        if depth_head is not None:
+            self.depth_head = MODELS.build(depth_head)
+            self.depth_samples = self.depth_head.depth_samples
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        bbox_head.update(train_cfg=train_cfg)
+        bbox_head.update(test_cfg=test_cfg)
+        self.bbox_head = MODELS.build(bbox_head)
+        self.voxel_size = voxel_size
+        self.voxel_range = anchor_generator['ranges'][0]
+        # number of voxels per axis: (max - min) / voxel size, where the
+        # range stores the three minima followed by the three maxima
+        self.n_voxels = [
+            round((self.voxel_range[3] - self.voxel_range[0]) /
+                  self.voxel_size[0]),
+            round((self.voxel_range[4] - self.voxel_range[1]) /
+                  self.voxel_size[1]),
+            round((self.voxel_range[5] - self.voxel_range[2]) /
+                  self.voxel_size[2])
+        ]
+        self.anchor_generator = TASK_UTILS.build(anchor_generator)
+        self.valid_sample = valid_sample
+        self.temporal_aggregate = temporal_aggregate
+        self.transform_depth = transform_depth
+    def extract_feat(self, batch_inputs_dict: dict,
+                     batch_data_samples: SampleList):
+        """Extract 3d features from the backbone -> fpn -> 3d projection.
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                the 'imgs' key.
+                    - imgs (torch.Tensor): Images of each sample. It is
+                      unpacked as a 5-dim tensor (B, Nt * num_views, C_in,
+                      H, W).
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. The ``metainfo`` of the first sample must
+                provide ``num_views`` and ``num_ref_frames``.
+        Returns:
+            tuple[torch.Tensor]: The outputs of
+            :meth:`feature_transformation`, followed by the perspective
+            view features of the current frame when a 2D depth head is used.
+        """
+        # NOTE: Nt means the number of frames temporally
+        # num_views means the number of views of a frame
+        img = batch_inputs_dict['imgs']
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        batch_size, _, C_in, H, W = img.shape
+        num_views = batch_img_metas[0]['num_views']
+        num_ref_frames = batch_img_metas[0]['num_ref_frames']
+        if num_ref_frames > 0:
+            num_frames = num_ref_frames + 1
+        else:
+            num_frames = 1
+        input_shape = img.shape[-2:]
+        # NOTE: input_shape is the largest pad_shape of the batch of images
+        for img_meta in batch_img_metas:
+            img_meta.update(input_shape=input_shape)
+        if num_ref_frames > 0:
+            # the first num_views images belong to the current frame, the
+            # remaining ones to the reference frames
+            cur_imgs = img[:, :num_views].reshape(-1, C_in, H, W)
+            prev_imgs = img[:, num_views:].reshape(-1, C_in, H, W)
+            cur_feats = self.backbone(cur_imgs)
+            cur_feats = self.neck(cur_feats)[0]
+            # reference frames are encoded without gradient tracking
+            with torch.no_grad():
+                prev_feats = self.backbone(prev_imgs)
+                prev_feats = self.neck(prev_feats)[0]
+            _, C_feat, H_feat, W_feat = cur_feats.shape
+            cur_feats = cur_feats.view(batch_size, -1, C_feat, H_feat, W_feat)
+            prev_feats = prev_feats.view(batch_size, -1, C_feat, H_feat,
+                                         W_feat)
+            batch_feats = torch.cat([cur_feats, prev_feats], dim=1)
+        else:
+            batch_imgs = img.view(-1, C_in, H, W)
+            batch_feats = self.backbone(batch_imgs)
+            # TODO: support SPP module neck
+            batch_feats = self.neck(batch_feats)[0]
+            _, C_feat, H_feat, W_feat = batch_feats.shape
+            batch_feats = batch_feats.view(batch_size, -1, C_feat, H_feat,
+                                           W_feat)
+        # transform the feature to voxel & stereo space
+        transform_feats = self.feature_transformation(batch_feats,
+                                                      batch_img_metas,
+                                                      num_views, num_frames)
+        if self.with_depth_head_2d:
+            transform_feats += (batch_feats[:, :num_views], )
+        return transform_feats
+    def feature_transformation(self, batch_feats, batch_img_metas, num_views,
+                               num_frames):
+        """Feature transformation from perspective view to BEV.
+        Args:
+            batch_feats (torch.Tensor): Perspective view features of shape
+                (batch_size, num_frames * num_views, C, H, W).
+            batch_img_metas (list[dict]): Image meta information. Each element
+                corresponds to a group of images. len(img_metas) == B.
+            num_views (int): Number of views.
+            num_frames (int): Number of consecutive frames.
+        Returns:
+            tuple[torch.Tensor]: Volume features and (optionally) stereo \
+            features.
+        Raises:
+            ValueError: If ``valid_sample`` is True and
+                ``temporal_aggregate`` is neither 'mean' nor 'concat'.
+        """
+        # TODO: support more complicated 2D feature sampling
+        points = self.anchor_generator.grid_anchors(
+            [self.n_voxels[::-1]], device=batch_feats.device)[0][:, :3]
+        volumes = []
+        img_scale_factors = []
+        img_flips = []
+        img_crop_offsets = []
+        for feature, img_meta in zip(batch_feats, batch_img_metas):
+            # TODO: remove feature sampling from back
+            # TODO: support different scale_factors/flip/crop_offset for
+            # different views
+            frame_volume = []
+            frame_valid_nums = []
+            for frame_idx in range(num_frames):
+                volume = []
+                valid_flags = []
+                if isinstance(img_meta['img_shape'], list):
+                    img_shape = img_meta['img_shape'][frame_idx][:2]
+                else:
+                    img_shape = img_meta['img_shape'][:2]
+                for view_idx in range(num_views):
+                    # images are laid out frame by frame, view by view
+                    sample_idx = frame_idx * num_views + view_idx
+                    if 'scale_factor' in img_meta:
+                        img_scale_factor = img_meta['scale_factor'][sample_idx]
+                        if isinstance(img_scale_factor, np.ndarray) and \
+                                len(img_meta['scale_factor']) >= 2:
+                            img_scale_factor = (
+                                points.new_tensor(img_scale_factor[:2]))
+                        else:
+                            img_scale_factor = (
+                                points.new_tensor(img_scale_factor))
+                    else:
+                        img_scale_factor = (1)
+                    img_flip = img_meta['flip'][sample_idx] \
+                        if 'flip' in img_meta.keys() else False
+                    img_crop_offset = (
+                        points.new_tensor(
+                            img_meta['img_crop_offset'][sample_idx])
+                        if 'img_crop_offset' in img_meta.keys() else 0)
+                    lidar2cam = points.new_tensor(
+                        img_meta['lidar2cam'][sample_idx])
+                    cam2img = points.new_tensor(
+                        img_meta['ori_cam2img'][sample_idx])
+                    # align the precision, the tensor is converted to float32
+                    lidar2img = get_lidar2img(cam2img.double(),
+                                              lidar2cam.double())
+                    lidar2img = lidar2img.float()
+                    sample_results = point_sample(
+                        img_meta,
+                        img_features=feature[sample_idx][None, ...],
+                        points=points,
+                        proj_mat=lidar2img,
+                        coord_type='LIDAR',
+                        img_scale_factor=img_scale_factor,
+                        img_crop_offset=img_crop_offset,
+                        img_flip=img_flip,
+                        img_pad_shape=img_meta['input_shape'],
+                        img_shape=img_shape,
+                        aligned=False,
+                        valid_flag=self.valid_sample)
+                    if self.valid_sample:
+                        volume.append(sample_results[0])
+                        valid_flags.append(sample_results[1])
+                    else:
+                        volume.append(sample_results)
+                    # TODO: save valid flags, more reasonable feat fusion
+                if self.valid_sample:
+                    # sum over views; the normalisation by the number of
+                    # valid views happens in the temporal aggregation below
+                    valid_nums = torch.stack(
+                        valid_flags, dim=0).sum(0)  # (N, )
+                    volume = torch.stack(volume, dim=0).sum(0)
+                    valid_mask = valid_nums > 0
+                    volume[~valid_mask] = 0
+                    frame_valid_nums.append(valid_nums)
+                else:
+                    volume = torch.stack(volume, dim=0).mean(0)
+                frame_volume.append(volume)
+            # NOTE: only the values of the last view of the last frame are
+            # recorded for each sample (see the TODO above)
+            img_scale_factors.append(img_scale_factor)
+            img_flips.append(img_flip)
+            img_crop_offsets.append(img_crop_offset)
+            if self.valid_sample:
+                if self.temporal_aggregate == 'mean':
+                    # average over all valid (frame, view) samples
+                    frame_volume = torch.stack(frame_volume, dim=0).sum(0)
+                    frame_valid_nums = torch.stack(
+                        frame_valid_nums, dim=0).sum(0)
+                    frame_valid_mask = frame_valid_nums > 0
+                    frame_volume[~frame_valid_mask] = 0
+                    frame_volume = frame_volume / torch.clamp(
+                        frame_valid_nums[:, None], min=1)
+                elif self.temporal_aggregate == 'concat':
+                    # average over valid views inside each frame, then
+                    # concatenate the frames along the channel dimension
+                    frame_valid_nums = torch.stack(frame_valid_nums, dim=1)
+                    frame_volume = torch.stack(frame_volume, dim=1)
+                    frame_valid_mask = frame_valid_nums > 0
+                    frame_volume[~frame_valid_mask] = 0
+                    frame_volume = (frame_volume / torch.clamp(
+                        frame_valid_nums[:, :, None], min=1)).flatten(
+                            start_dim=1, end_dim=2)
+                else:
+                    # without this branch frame_volume would stay a list
+                    # and fail later with an obscure AttributeError
+                    raise ValueError(
+                        "temporal_aggregate must be 'mean' or 'concat' "
+                        'when valid_sample is True, but got '
+                        f'{self.temporal_aggregate!r}')
+            else:
+                frame_volume = torch.stack(frame_volume, dim=0).mean(0)
+            volumes.append(
+                frame_volume.reshape(self.n_voxels[::-1] + [-1]).permute(
+                    3, 2, 1, 0))
+        volume_feat = torch.stack(volumes)  # (B, C, N_x, N_y, N_z)
+        if self.with_backbone_3d:
+            outputs = self.backbone_3d(volume_feat)
+            volume_feat = outputs[0]
+            if self.backbone_3d.output_bev:
+                # use outputs[0] if len(outputs) == 1
+                # use outputs[1] if len(outputs) == 2
+                # TODO: unify the output formats
+                bev_feat = outputs[-1]
+        # grid_sample stereo features from the volume feature
+        # TODO: also support temporal modeling for depth head
+        if self.with_depth_head:
+            batch_stereo_feats = []
+            for batch_idx in range(volume_feat.shape[0]):
+                # always read the meta of the sample being processed; the
+                # loop variable of the sampling loop above still points at
+                # the last sample of the batch
+                sample_meta = batch_img_metas[batch_idx]
+                stereo_feat = []
+                for view_idx in range(num_views):
+                    img_scale_factor = img_scale_factors[batch_idx] \
+                        if self.transform_depth else points.new_tensor(
+                            [1., 1.])
+                    img_crop_offset = img_crop_offsets[batch_idx] \
+                        if self.transform_depth else points.new_tensor(
+                            [0., 0.])
+                    img_flip = img_flips[batch_idx] if self.transform_depth \
+                        else False
+                    img_pad_shape = sample_meta['input_shape'] \
+                        if self.transform_depth \
+                        else sample_meta['ori_shape'][:2]
+                    lidar2cam = points.new_tensor(
+                        sample_meta['lidar2cam'][view_idx])
+                    # original intrinsics, the same key as used for the
+                    # point sampling above
+                    cam2img = points.new_tensor(
+                        sample_meta['ori_cam2img'][view_idx])
+                    proj_mat = torch.matmul(cam2img, lidar2cam)
+                    # NOTE(review): img_shape is indexed per view here but
+                    # may be a single shape above - confirm the meta format
+                    stereo_feat.append(
+                        voxel_sample(
+                            volume_feat[batch_idx][None],
+                            voxel_range=self.voxel_range,
+                            voxel_size=self.voxel_size,
+                            depth_samples=volume_feat.new_tensor(
+                                self.depth_samples),
+                            proj_mat=proj_mat,
+                            downsample_factor=self.depth_head.
+                            downsample_factor,
+                            img_scale_factor=img_scale_factor,
+                            img_crop_offset=img_crop_offset,
+                            img_flip=img_flip,
+                            img_pad_shape=img_pad_shape,
+                            img_shape=sample_meta['img_shape']
+                            [view_idx][:2],
+                            aligned=True))  # TODO: study the aligned setting
+                batch_stereo_feats.append(torch.cat(stereo_feat))
+            # cat (N, C, D, H, W) -> (B*N, C, D, H, W)
+            batch_stereo_feats = torch.cat(batch_stereo_feats)
+        if self.with_neck_3d:
+            if self.with_backbone_3d and self.backbone_3d.output_bev:
+                spatial_features = self.neck_3d(bev_feat)
+                # TODO: unify the outputs of neck_3d
+                volume_feat = spatial_features[1]
+            else:
+                volume_feat = self.neck_3d(volume_feat)[0]
+        # TODO: unify the output format of neck_3d
+        transform_feats = (volume_feat, )
+        if self.with_depth_head:
+            transform_feats += (batch_stereo_feats, )
+        return transform_feats
+    def aug_test(self, imgs, img_metas, **kwargs):
+        """Test with augmentations.
+        Test-time augmentation is not supported by this detector.
+        Args:
+            imgs (list[torch.Tensor]): Input images of shape (N, C_in, H, W).
+            img_metas (list): Image metas.
+        Raises:
+            NotImplementedError: Always.
+        """
+        raise NotImplementedError
--- a/mmdet3d/models/detectors/mvx_two_stage.py
+++ b/mmdet3d/models/detectors/mvx_two_stage.py
@@ -401,7 +401,7 @@ class MVXTwoStageDetector(Base3DDetector):
        else:
            results_list_2d = None
-        detsamples = self.convert_to_datasample(batch_data_samples,
+        detsamples = self.add_pred_to_datasample(batch_data_samples,
-                                                results_list_3d,
+                                                 results_list_3d,
-                                                results_list_2d)
+                                                 results_list_2d)
        return detsamples
--- a/mmdet3d/models/detectors/point_rcnn.py
+++ b/mmdet3d/models/detectors/point_rcnn.py
 # Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional
 import torch
 from mmdet3d.registry import MODELS
@@ -23,14 +25,14 @@ class PointRCNN(TwoStage3DDetector):
    """
    def __init__(self,
-                 backbone,
+                 backbone: dict,
-                 neck=None,
+                 neck: Optional[dict] = None,
-                 rpn_head=None,
+                 rpn_head: Optional[dict] = None,
-                 roi_head=None,
+                 roi_head: Optional[dict] = None,
-                 train_cfg=None,
+                 train_cfg: Optional[dict] = None,
-                 test_cfg=None,
+                 test_cfg: Optional[dict] = None,
-                 pretrained=None,
+                 init_cfg: Optional[dict] = None,
-                 init_cfg=None):
+                 data_preprocessor: Optional[dict] = None) -> Optional:
        super(PointRCNN, self).__init__(
            backbone=backbone,
            neck=neck,
@@ -38,111 +40,28 @@ class PointRCNN(TwoStage3DDetector):
            roi_head=roi_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
-            pretrained=pretrained,
+            init_cfg=init_cfg,
-            init_cfg=init_cfg)
+            data_preprocessor=data_preprocessor)
-    def extract_feat(self, points):
+    def extract_feat(self, batch_inputs_dict: Dict) -> Dict:
        """Directly extract features from the backbone+neck.
        Args:
-            points (torch.Tensor): Input points.
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'imgs' keys.
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - imgs (torch.Tensor, optional): Image of each sample.
        Returns:
-            dict: Features from the backbone+neck
+            dict: Features from the backbone+neck and raw points.
        """
+        points = torch.stack(batch_inputs_dict['points'])
        x = self.backbone(points)
        if self.with_neck:
            x = self.neck(x)
-        return x
+        return dict(
+            fp_features=x['fp_features'].clone(),
-    def forward_train(self, points, input_metas, gt_bboxes_3d, gt_labels_3d):
+            fp_points=x['fp_xyz'].clone(),
-        """Forward of training.
+            raw_points=points)
-        Args:
-            points (list[torch.Tensor]): Points of each batch.
-            input_metas (list[dict]): Meta information of each sample.
-            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch.
-            gt_labels_3d (list[torch.Tensor]): gt class labels of each batch.
-        Returns:
-            dict: Losses.
-        """
-        losses = dict()
-        stack_points = torch.stack(points)
-        x = self.extract_feat(stack_points)
-        # features for rcnn
-        backbone_feats = x['fp_features'].clone()
-        backbone_xyz = x['fp_xyz'].clone()
-        rcnn_feats = {'features': backbone_feats, 'points': backbone_xyz}
-        bbox_preds, cls_preds = self.rpn_head(x)
-        rpn_loss = self.rpn_head.loss(
-            bbox_preds=bbox_preds,
-            cls_preds=cls_preds,
-            points=points,
-            gt_bboxes_3d=gt_bboxes_3d,
-            gt_labels_3d=gt_labels_3d,
-            input_metas=input_metas)
-        losses.update(rpn_loss)
-        bbox_list = self.rpn_head.get_bboxes(stack_points, bbox_preds,
-                                             cls_preds, input_metas)
-        proposal_list = [
-            dict(
-                boxes_3d=bboxes,
-                scores_3d=scores,
-                labels_3d=labels,
-                cls_preds=preds_cls)
-            for bboxes, scores, labels, preds_cls in bbox_list
-        ]
-        rcnn_feats.update({'points_cls_preds': cls_preds})
-        roi_losses = self.roi_head.forward_train(rcnn_feats, input_metas,
-                                                 proposal_list, gt_bboxes_3d,
-                                                 gt_labels_3d)
-        losses.update(roi_losses)
-        return losses
-    def simple_test(self, points, img_metas, imgs=None, rescale=False):
-        """Forward of testing.
-        Args:
-            points (list[torch.Tensor]): Points of each sample.
-            img_metas (list[dict]): Image metas.
-            imgs (list[torch.Tensor], optional): Images of each sample.
-                Defaults to None.
-            rescale (bool, optional): Whether to rescale results.
-                Defaults to False.
-        Returns:
-            list: Predicted 3d boxes.
-        """
-        stack_points = torch.stack(points)
-        x = self.extract_feat(stack_points)
-        # features for rcnn
-        backbone_feats = x['fp_features'].clone()
-        backbone_xyz = x['fp_xyz'].clone()
-        rcnn_feats = {'features': backbone_feats, 'points': backbone_xyz}
-        bbox_preds, cls_preds = self.rpn_head(x)
-        rcnn_feats.update({'points_cls_preds': cls_preds})
-        bbox_list = self.rpn_head.get_bboxes(
-            stack_points, bbox_preds, cls_preds, img_metas, rescale=rescale)
-        proposal_list = [
-            dict(
-                boxes_3d=bboxes,
-                scores_3d=scores,
-                labels_3d=labels,
-                cls_preds=preds_cls)
-            for bboxes, scores, labels, preds_cls in bbox_list
-        ]
-        bbox_results = self.roi_head.simple_test(rcnn_feats, img_metas,
-                                                 proposal_list)
-        return bbox_results
--- a/mmdet3d/models/detectors/single_stage.py
+++ b/mmdet3d/models/detectors/single_stage.py
@@ -108,8 +108,8 @@ class SingleStage3DDetector(Base3DDetector):
        """
        x = self.extract_feat(batch_inputs_dict)
        results_list = self.bbox_head.predict(x, batch_data_samples, **kwargs)
-        predictions = self.convert_to_datasample(batch_data_samples,
+        predictions = self.add_pred_to_datasample(batch_data_samples,
-                                                 results_list)
+                                                  results_list)
        return predictions
    def _forward(self,

--- a/mmdet3d/models/detectors/single_stage_mono3d.py
+++ b/mmdet3d/models/detectors/single_stage_mono3d.py
@@ -18,7 +18,7 @@ class SingleStageMono3DDetector(SingleStageDetector):
    boxes on the output features of the backbone+neck.
    """
-    def convert_to_datasample(
+    def add_pred_to_datasample(
        self,
        data_samples: SampleList,
        data_instances_3d: OptInstanceList = None,

--- a/mmdet3d/models/detectors/two_stage.py
+++ b/mmdet3d/models/detectors/two_stage.py
@@ -100,8 +100,9 @@ class TwoStage3DDetector(Base3DDetector):
            keys = rpn_losses.keys()
            for key in keys:
                if 'loss' in key and 'rpn' not in key:
-                    rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
+                    losses[f'rpn_{key}'] = rpn_losses[key]
-            losses.update(rpn_losses)
+                else:
+                    losses[key] = rpn_losses[key]
        else:
            # TODO: Not support currently, should have a check at Fast R-CNN
            assert batch_data_samples[0].get('proposals', None) is not None
@@ -161,8 +162,8 @@ class TwoStage3DDetector(Base3DDetector):
                                             batch_data_samples)
        # convert to Det3DDataSample
-        results_list = self.convert_to_datasample(batch_data_samples,
+        results_list = self.add_pred_to_datasample(batch_data_samples,
-                                                  results_list)
+                                                   results_list)
        return results_list

--- a/mmdet3d/models/detectors/votenet.py
+++ b/mmdet3d/models/detectors/votenet.py
@@ -99,8 +99,8 @@ class VoteNet(SingleStage3DDetector):
        points = batch_inputs_dict['points']
        results_list = self.bbox_head.predict(points, feats_dict,
                                              batch_data_samples, **kwargs)
-        data_3d_samples = self.convert_to_datasample(batch_data_samples,
+        data_3d_samples = self.add_pred_to_datasample(batch_data_samples,
-                                                     results_list)
+                                                      results_list)
        return data_3d_samples
    def aug_test(self, aug_inputs_list: List[dict],
@@ -143,6 +143,6 @@ class VoteNet(SingleStage3DDetector):
                                                  self.bbox_head.test_cfg)
        merged_results = InstanceData(**merged_results_dict)
-        data_3d_samples = self.convert_to_datasample(batch_data_samples,
+        data_3d_samples = self.add_pred_to_datasample(batch_data_samples,
-                                                     [merged_results])
+                                                      [merged_results])
        return data_3d_samples
--- a/mmdet3d/models/layers/fusion_layers/point_fusion.py
+++ b/mmdet3d/models/layers/fusion_layers/point_fusion.py
@@ -7,7 +7,7 @@ from torch.nn import functional as F
 from mmdet3d.registry import MODELS
 from mmdet3d.structures.bbox_3d import (get_proj_mat_by_coord_type,
-                                        points_cam2img)
+                                        points_cam2img, points_img2cam)
 from . import apply_3d_transformation
@@ -23,7 +23,8 @@ def point_sample(img_meta,
                 img_shape,
                 aligned=True,
                 padding_mode='zeros',
-                 align_corners=True):
+                 align_corners=True,
+                 valid_flag=False):
    """Obtain image features using points.
    Args:
@@ -41,12 +42,15 @@ def point_sample(img_meta,
            padding, this is necessary to obtain features in feature map.
        img_shape (tuple[int]): int tuple indicates the h & w before padding
            after scaling, this is necessary for flipping coordinates.
-        aligned (bool, optional): Whether use bilinear interpolation when
+        aligned (bool): Whether use bilinear interpolation when
            sampling image features for each point. Defaults to True.
-        padding_mode (str, optional): Padding mode when padding values for
+        padding_mode (str): Padding mode when padding values for
            features of out-of-image points. Defaults to 'zeros'.
-        align_corners (bool, optional): Whether to align corners when
+        align_corners (bool): Whether to align corners when
            sampling image features for each point. Defaults to True.
+        valid_flag (bool): Whether to zero out the features of points that
+            project outside the padded image or have non-positive depth,
+            and to also return the validity mask. Defaults to False.
    Returns:
        torch.Tensor: NxC image features sampled by point coordinates.
@@ -56,8 +60,13 @@ def point_sample(img_meta,
    points = apply_3d_transformation(
        points, coord_type, img_meta, reverse=True)
-    # project points to camera coordinate
+    # project points to image coordinate
-    pts_2d = points_cam2img(points, proj_mat)
+    if valid_flag:
+        proj_pts = points_cam2img(points, proj_mat, with_depth=True)
+        pts_2d = proj_pts[..., :2]
+        depths = proj_pts[..., 2]
+    else:
+        pts_2d = points_cam2img(points, proj_mat)
    # img transformation: scale -> crop -> flip
    # the image is resized by img_scale_factor
@@ -70,13 +79,13 @@ def point_sample(img_meta,
    if img_flip:
        # by default we take it as horizontal flip
        # use img_shape before padding for flip
-        orig_h, orig_w = img_shape
+        ori_h, ori_w = img_shape
-        coor_x = orig_w - coor_x
+        coor_x = ori_w - coor_x
    h, w = img_pad_shape
-    coor_y = coor_y / h * 2 - 1
+    norm_coor_y = coor_y / h * 2 - 1
-    coor_x = coor_x / w * 2 - 1
+    norm_coor_x = coor_x / w * 2 - 1
-    grid = torch.cat([coor_x, coor_y],
+    grid = torch.cat([norm_coor_x, norm_coor_y],
                     dim=1).unsqueeze(0).unsqueeze(0)  # Nx2 -> 1x1xNx2
    # align_corner=True provides higher performance
@@ -88,6 +97,15 @@ def point_sample(img_meta,
        padding_mode=padding_mode,
        align_corners=align_corners)  # 1xCx1xN feats
+    if valid_flag:
+        # (N, )
+        valid = (coor_x.squeeze() < w) & (coor_x.squeeze() > 0) & (
+            coor_y.squeeze() < h) & (coor_y.squeeze() > 0) & (
+                depths > 0)
+        valid_features = point_features.squeeze().t()
+        valid_features[~valid] = 0
+        return valid_features, valid  # (N, C), (N,)
    return point_features.squeeze().t()
@@ -304,3 +322,94 @@ class PointFusion(BaseModule):
            align_corners=self.align_corners,
        )
        return img_pts
+def voxel_sample(voxel_features,
+                 voxel_range,
+                 voxel_size,
+                 depth_samples,
+                 proj_mat,
+                 downsample_factor,
+                 img_scale_factor,
+                 img_crop_offset,
+                 img_flip,
+                 img_pad_shape,
+                 img_shape,
+                 aligned=True,
+                 padding_mode='zeros',
+                 align_corners=True):
+    """Sample frustum features from voxel features.
+    A (u, v, depth) frustum grid is built at the downsampled resolution of
+    the padded image, the image augmentations are undone, the grid is
+    back-projected with ``points_img2cam`` and finally used to sample
+    ``voxel_features`` with ``F.grid_sample``.
+    Args:
+        voxel_features (torch.Tensor): 1 x C x Nx x Ny x Nz voxel features.
+        voxel_range (list): The range of voxel features. It is reshaped to
+            (1, 6); the first three entries are used as the lower bound
+            and the last three as the upper bound.
+        voxel_size (list | tuple): The voxel size of voxel features. It is
+            converted to a tensor and reshaped to (1, 3), so it should
+            hold three numbers (NOTE(review): the previous docstring
+            listed ``ConfigDict`` or dict here -- confirm with callers).
+        depth_samples (torch.Tensor): N depth samples in LiDAR coordinates.
+            They are subsampled with a stride of ``downsample_factor``.
+        proj_mat (torch.Tensor): ORIGINAL LiDAR2img projection matrix
+            for N views. It is passed unchanged to ``points_img2cam``.
+        downsample_factor (int): The downsample factor in rescaling. It is
+            used as the pixel stride of the frustum grid and as the stride
+            for subsampling ``depth_samples``.
+        img_scale_factor (tuple[torch.Tensor]): Scale factor with shape of
+            (w_scale, h_scale).
+        img_crop_offset (tuple[torch.Tensor]): Crop offset used to crop
+            image during data augmentation with shape of (w_offset, h_offset).
+        img_flip (bool): Whether the image is flipped.
+        img_pad_shape (tuple[int]): int tuple indicates the h & w after
+            padding, this is necessary to obtain features in feature map.
+        img_shape (tuple[int]): int tuple indicates the h & w before padding
+            after scaling, this is necessary for flipping coordinates.
+        aligned (bool, optional): Whether to use bilinear interpolation
+            (otherwise nearest) when sampling the voxel features.
+            Defaults to True.
+        padding_mode (str, optional): Padding mode forwarded to
+            ``F.grid_sample`` for grid locations outside the voxel range.
+            Defaults to 'zeros'.
+        align_corners (bool, optional): Whether to align corners when
+            sampling the voxel features. Defaults to True.
+    Returns:
+        torch.Tensor: 1xCxDxHxW frustum features sampled from voxel features,
+        where D is the number of subsampled depths and H, W are the
+        downsampled height and width of the padded image.
+    """
+    # construct frustum grid
+    device = voxel_features.device
+    h, w = img_pad_shape
+    h_out = round(h / downsample_factor)
+    w_out = round(w / downsample_factor)
+    # pixel coordinates in the padded image, spaced by downsample_factor
+    ws = (torch.linspace(0, w_out - 1, w_out) * downsample_factor).to(device)
+    hs = (torch.linspace(0, h_out - 1, h_out) * downsample_factor).to(device)
+    depths = depth_samples[::downsample_factor]
+    num_depths = len(depths)
+    # NOTE(review): the (D, H_out, W_out) layout below relies on
+    # torch.meshgrid's default 'ij' indexing -- confirm for the torch
+    # version in use
+    ds_3d, ys_3d, xs_3d = torch.meshgrid(depths, hs, ws)
+    # grid: (D, H_out, W_out, 3) -> (D*H_out*W_out, 3)
+    # grid is freshly created by torch.stack, so the in-place updates
+    # below do not modify any of the inputs
+    grid = torch.stack([xs_3d, ys_3d, ds_3d], dim=-1).view(-1, 3)
+    # recover the coordinates in the canonical space
+    # reverse order of augmentations: flip -> crop -> scale
+    if img_flip:
+        # by default we take it as horizontal flip
+        # use img_shape before padding for flip
+        ori_h, ori_w = img_shape
+        grid[:, 0] = ori_w - grid[:, 0]
+    grid[:, :2] += img_crop_offset
+    grid[:, :2] /= img_scale_factor
+    # grid3d: (D*H_out*W_out, 3) in LiDAR coordinate system
+    grid3d = points_img2cam(grid, proj_mat)
+    # convert the 3D point coordinates to voxel coordinates
+    voxel_range = torch.tensor(voxel_range).to(device).view(1, 6)
+    voxel_size = torch.tensor(voxel_size).to(device).view(1, 3)
+    # suppose the voxel grid is generated with AlignedAnchorGenerator
+    # -0.5 given each grid is located at the center of the grid
+    # TODO: study whether here needs -0.5
+    grid3d = (grid3d - voxel_range[:, :3]) / voxel_size - 0.5
+    # number of voxels along each axis
+    grid_size = (voxel_range[:, 3:] - voxel_range[:, :3]) / voxel_size
+    # normalize grid3d to (-1, 1)
+    grid3d = grid3d / grid_size * 2 - 1
+    # (x, y, z) -> (z, y, x) for grid_sampling
+    grid3d = grid3d.view(1, num_depths, h_out, w_out, 3)[..., [2, 1, 0]]
+    # align_corners=True provides higher performance
+    mode = 'bilinear' if aligned else 'nearest'
+    frustum_features = F.grid_sample(
+        voxel_features,
+        grid3d,
+        mode=mode,
+        padding_mode=padding_mode,
+        align_corners=align_corners)  # 1xCxDxHxW feats
+    return frustum_features
--- a/mmdet3d/models/layers/spconv/__init__.py
+++ b/mmdet3d/models/layers/spconv/__init__.py
@@ -6,7 +6,9 @@ try:
 except ImportError:
    IS_SPCONV2_AVAILABLE = False
 else:
-    if hasattr(spconv, '__version__') and spconv.__version__ >= '2.0.0':
+    if hasattr(spconv,
+               '__version__') and spconv.__version__ >= '2.0.0' and hasattr(
+                   spconv, 'pytorch'):
        IS_SPCONV2_AVAILABLE = register_spconv2()
    else:
        IS_SPCONV2_AVAILABLE = False

--- a/mmdet3d/models/layers/transformer.py
+++ b/mmdet3d/models/layers/transformer.py
@@ -6,7 +6,7 @@ from torch import nn as nn
 @MODELS.register_module()
 class GroupFree3DMHA(MultiheadAttention):
-    """A warpper for torch.nn.MultiheadAttention for GroupFree3D.
+    """A wrapper for torch.nn.MultiheadAttention for GroupFree3D.
    This module implements MultiheadAttention with identity connection,
    and positional encoding used in DETR is also passed as input.

--- a/mmdet3d/models/middle_encoders/sparse_encoder.py
+++ b/mmdet3d/models/middle_encoders/sparse_encoder.py
@@ -278,7 +278,7 @@ class SparseEncoderSASSD(SparseEncoder):
        Returns:
            dict: Backbone features.
            tuple[torch.Tensor]: Mean feature value of the points,
-                Classificaion result of the points,
+                Classification result of the points,
                Regression offsets of the points.
        """
        coors = coors.int()
@@ -409,7 +409,7 @@ class SparseEncoderSASSD(SparseEncoder):
        Args:
            points (torch.Tensor): Mean feature value of the points.
-            point_cls (torch.Tensor): Classificaion result of the points.
+            point_cls (torch.Tensor): Classification result of the points.
            point_reg (torch.Tensor): Regression offsets of the points.
            gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth
                boxes for each sample.

--- a/mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py
+++ b/mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List
+from typing import Dict, List, Tuple
 import numpy as np
 import torch
@@ -10,6 +10,7 @@ from torch import Tensor
 from mmdet3d.models import make_sparse_convmodule
 from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
+from mmdet3d.utils.typing import InstanceList
 from mmdet.models.utils import multi_apply
 if IS_SPCONV2_AVAILABLE:
@@ -21,11 +22,11 @@ else:
 from mmengine.model import BaseModule
 from torch import nn as nn
-from mmdet3d.models.builder import build_loss
 from mmdet3d.models.layers import nms_bev, nms_normal_bev
 from mmdet3d.registry import MODELS, TASK_UTILS
 from mmdet3d.structures.bbox_3d import (LiDARInstance3DBoxes,
                                        rotation_3d_in_axis, xywhr2xyxyr)
+from mmdet3d.utils.typing import SamplingResultList
 @MODELS.register_module()
@@ -56,40 +57,40 @@ class PartA2BboxHead(BaseModule):
        conv_cfg (dict): Config dict of convolutional layers
        norm_cfg (dict): Config dict of normalization layers
        loss_bbox (dict): Config dict of box regression loss.
-        loss_cls (dict): Config dict of classifacation loss.
+        loss_cls (dict, optional): Config dict of classification loss.
    """
    def __init__(self,
-                 num_classes,
+                 num_classes: int,
-                 seg_in_channels,
+                 seg_in_channels: int,
-                 part_in_channels,
+                 part_in_channels: int,
-                 seg_conv_channels=None,
+                 seg_conv_channels: List[int] = None,
-                 part_conv_channels=None,
+                 part_conv_channels: List[int] = None,
-                 merge_conv_channels=None,
+                 merge_conv_channels: List[int] = None,
-                 down_conv_channels=None,
+                 down_conv_channels: List[int] = None,
-                 shared_fc_channels=None,
+                 shared_fc_channels: List[int] = None,
-                 cls_channels=None,
+                 cls_channels: List[int] = None,
-                 reg_channels=None,
+                 reg_channels: List[int] = None,
-                 dropout_ratio=0.1,
+                 dropout_ratio: float = 0.1,
-                 roi_feat_size=14,
+                 roi_feat_size: int = 14,
-                 with_corner_loss=True,
+                 with_corner_loss: bool = True,
-                 bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+                 bbox_coder: dict = dict(type='DeltaXYZWLHRBBoxCoder'),
-                 conv_cfg=dict(type='Conv1d'),
+                 conv_cfg: dict = dict(type='Conv1d'),
-                 norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+                 norm_cfg: dict = dict(type='BN1d', eps=1e-3, momentum=0.01),
-                 loss_bbox=dict(
+                 loss_bbox: dict = dict(
                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
-                 loss_cls=dict(
+                 loss_cls: dict = dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=True,
                     reduction='none',
                     loss_weight=1.0),
-                 init_cfg=None):
+                 init_cfg: dict = None) -> None:
        super(PartA2BboxHead, self).__init__(init_cfg=init_cfg)
        self.num_classes = num_classes
        self.with_corner_loss = with_corner_loss
        self.bbox_coder = TASK_UTILS.build(bbox_coder)
-        self.loss_bbox = build_loss(loss_bbox)
+        self.loss_bbox = MODELS.build(loss_bbox)
-        self.loss_cls = build_loss(loss_cls)
+        self.loss_cls = MODELS.build(loss_cls)
        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        assert down_conv_channels[-1] == shared_fc_channels[0]
@@ -244,7 +245,7 @@ class PartA2BboxHead(BaseModule):
        super().init_weights()
        normal_init(self.conv_reg[-1].conv, mean=0, std=0.001)
-    def forward(self, seg_feats, part_feats):
+    def forward(self, seg_feats: Tensor, part_feats: Tensor) -> Tuple[Tensor]:
        """Forward pass.
        Args:
@@ -294,8 +295,10 @@ class PartA2BboxHead(BaseModule):
        return cls_score, bbox_pred
-    def loss(self, cls_score, bbox_pred, rois, labels, bbox_targets,
+    def loss(self, cls_score: Tensor, bbox_pred: Tensor, rois: Tensor,
-             pos_gt_bboxes, reg_mask, label_weights, bbox_weights):
+             labels: Tensor, bbox_targets: Tensor, pos_gt_bboxes: Tensor,
+             reg_mask: Tensor, label_weights: Tensor,
+             bbox_weights: Tensor) -> Dict:
        """Computing losses.
        Args:
@@ -329,9 +332,9 @@ class PartA2BboxHead(BaseModule):
        pos_inds = (reg_mask > 0)
        if pos_inds.any() == 0:
            # fake a part loss
-            losses['loss_bbox'] = loss_cls.new_tensor(0)
+            losses['loss_bbox'] = loss_cls.new_tensor(0) * loss_cls.sum()
            if self.with_corner_loss:
-                losses['loss_corner'] = loss_cls.new_tensor(0)
+                losses['loss_corner'] = loss_cls.new_tensor(0) * loss_cls.sum()
        else:
            pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds]
            bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat(
@@ -367,7 +370,10 @@ class PartA2BboxHead(BaseModule):
        return losses
-    def get_targets(self, sampling_results, rcnn_train_cfg, concat=True):
+    def get_targets(self,
+                    sampling_results: SamplingResultList,
+                    rcnn_train_cfg: dict,
+                    concat: bool = True) -> Tuple[Tensor]:
        """Generate targets.
        Args:
@@ -407,7 +413,8 @@ class PartA2BboxHead(BaseModule):
        return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
                bbox_weights)
-    def _get_target_single(self, pos_bboxes, pos_gt_bboxes, ious, cfg):
+    def _get_target_single(self, pos_bboxes: Tensor, pos_gt_bboxes: Tensor,
+                           ious: Tensor, cfg: dict) -> Tuple[Tensor]:
        """Generate training targets for a single sample.
        Args:
@@ -472,7 +479,10 @@ class PartA2BboxHead(BaseModule):
        return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
                bbox_weights)
-    def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1.0):
+    def get_corner_loss_lidar(self,
+                              pred_bbox3d: Tensor,
+                              gt_bbox3d: Tensor,
+                              delta: float = 1.0) -> Tensor:
        """Calculate corner loss of given boxes.
        Args:
@@ -515,7 +525,7 @@ class PartA2BboxHead(BaseModule):
                    class_labels: Tensor,
                    class_pred: Tensor,
                    input_metas: List[dict],
-                    cfg: dict = None) -> List:
+                    cfg: dict = None) -> InstanceList:
        """Generate bboxes from bbox head predictions.
        Args:
@@ -528,7 +538,17 @@ class PartA2BboxHead(BaseModule):
            cfg (:obj:`ConfigDict`): Testing config.
        Returns:
-            list[tuple]: Decoded bbox, scores and labels after nms.
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
        """
        roi_batch_id = rois[..., 0]
        roi_boxes = rois[..., 1:]  # boxes without batch id
@@ -570,12 +590,12 @@ class PartA2BboxHead(BaseModule):
        return result_list
    def multi_class_nms(self,
-                        box_probs,
+                        box_probs: Tensor,
-                        box_preds,
+                        box_preds: Tensor,
-                        score_thr,
+                        score_thr: float,
-                        nms_thr,
+                        nms_thr: float,
-                        input_meta,
+                        input_meta: dict,
-                        use_rotate_nms=True):
+                        use_rotate_nms: bool = True) -> Tensor:
        """Multi-class NMS for box head.
        Note:

--- a/mmdet3d/models/roi_heads/bbox_heads/point_rcnn_bbox_head.py
+++ b/mmdet3d/models/roi_heads/bbox_heads/point_rcnn_bbox_head.py
 # Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple
 import numpy as np
 import torch
+import torch.nn as nn
 from mmcv.cnn import ConvModule
 from mmcv.cnn.bricks import build_conv_layer
 from mmengine.model import BaseModule, normal_init
-from torch import nn as nn
+from mmengine.structures import InstanceData
+from torch import Tensor
 from mmdet3d.models.layers import nms_bev, nms_normal_bev
 from mmdet3d.models.layers.pointnet_modules import build_sa_module
 from mmdet3d.registry import MODELS, TASK_UTILS
 from mmdet3d.structures.bbox_3d import (LiDARInstance3DBoxes,
                                        rotation_3d_in_axis, xywhr2xyxyr)
+from mmdet3d.utils.typing import InstanceList, SamplingResultList
 from mmdet.models.utils import multi_apply
@@ -24,17 +29,17 @@ class PointRCNNBboxHead(BaseModule):
        mlp_channels (list[int]): the number of mlp channels
        pred_layer_cfg (dict, optional): Config of classification and
            regression prediction layers. Defaults to None.
-        num_points (tuple, optional): The number of points which each SA
+        num_points (tuple): The number of points which each SA
            module samples. Defaults to (128, 32, -1).
-        radius (tuple, optional): Sampling radius of each SA module.
+        radius (tuple): Sampling radius of each SA module.
            Defaults to (0.2, 0.4, 100).
-        num_samples (tuple, optional): The number of samples for ball query
+        num_samples (tuple): The number of samples for ball query
            in each SA module. Defaults to (64, 64, 64).
-        sa_channels (tuple, optional): Out channels of each mlp in SA module.
+        sa_channels (tuple): Out channels of each mlp in SA module.
            Defaults to ((128, 128, 128), (128, 128, 256), (256, 256, 512)).
-        bbox_coder (dict, optional): Config dict of box coders.
+        bbox_coder (dict): Config dict of box coders.
            Defaults to dict(type='DeltaXYZWLHRBBoxCoder').
-        sa_cfg (dict, optional): Config of set abstraction module, which may
+        sa_cfg (dict): Config of set abstraction module, which may
            contain the following keys and values:
            - pool_mod (str): Pool method ('max' or 'avg') for SA modules.
@@ -43,52 +48,53 @@ class PointRCNNBboxHead(BaseModule):
              each SA module.
            Defaults to dict(type='PointSAModule', pool_mod='max',
                use_xyz=True).
-        conv_cfg (dict, optional): Config dict of convolutional layers.
+        conv_cfg (dict): Config dict of convolutional layers.
             Defaults to dict(type='Conv1d').
-        norm_cfg (dict, optional): Config dict of normalization layers.
+        norm_cfg (dict): Config dict of normalization layers.
             Defaults to dict(type='BN1d').
-        act_cfg (dict, optional): Config dict of activation layers.
+        act_cfg (dict): Config dict of activation layers.
            Defaults to dict(type='ReLU').
-        bias (str, optional): Type of bias. Defaults to 'auto'.
+        bias (str): Type of bias. Defaults to 'auto'.
-        loss_bbox (dict, optional): Config of regression loss function.
+        loss_bbox (dict): Config of regression loss function.
            Defaults to dict(type='SmoothL1Loss', beta=1.0 / 9.0,
                reduction='sum', loss_weight=1.0).
-        loss_cls (dict, optional): Config of classification loss function.
+        loss_cls (dict): Config of classification loss function.
             Defaults to dict(type='CrossEntropyLoss', use_sigmoid=True,
                reduction='sum', loss_weight=1.0).
-        with_corner_loss (bool, optional): Whether using corner loss.
+        with_corner_loss (bool): Whether using corner loss.
            Defaults to True.
        init_cfg (dict, optional): Config of initialization. Defaults to None.
    """
-    def __init__(
+    def __init__(self,
-            self,
+                 num_classes: dict,
-            num_classes,
+                 in_channels: dict,
-            in_channels,
+                 mlp_channels: dict,
-            mlp_channels,
+                 pred_layer_cfg: Optional[dict] = None,
-            pred_layer_cfg=None,
+                 num_points: dict = (128, 32, -1),
-            num_points=(128, 32, -1),
+                 radius: dict = (0.2, 0.4, 100),
-            radius=(0.2, 0.4, 100),
+                 num_samples: dict = (64, 64, 64),
-            num_samples=(64, 64, 64),
+                 sa_channels: dict = ((128, 128, 128), (128, 128, 256),
-            sa_channels=((128, 128, 128), (128, 128, 256), (256, 256, 512)),
+                                      (256, 256, 512)),
-            bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+                 bbox_coder: dict = dict(type='DeltaXYZWLHRBBoxCoder'),
-            sa_cfg=dict(type='PointSAModule', pool_mod='max', use_xyz=True),
+                 sa_cfg: dict = dict(
-            conv_cfg=dict(type='Conv1d'),
+                     type='PointSAModule', pool_mod='max', use_xyz=True),
-            norm_cfg=dict(type='BN1d'),
+                 conv_cfg: dict = dict(type='Conv1d'),
-            act_cfg=dict(type='ReLU'),
+                 norm_cfg: dict = dict(type='BN1d'),
-            bias='auto',
+                 act_cfg: dict = dict(type='ReLU'),
-            loss_bbox=dict(
+                 bias: str = 'auto',
-                type='SmoothL1Loss',
+                 loss_bbox: dict = dict(
-                beta=1.0 / 9.0,
+                     type='SmoothL1Loss',
-                reduction='sum',
+                     beta=1.0 / 9.0,
-                loss_weight=1.0),
+                     reduction='sum',
-            loss_cls=dict(
+                     loss_weight=1.0),
-                type='CrossEntropyLoss',
+                 loss_cls: dict = dict(
-                use_sigmoid=True,
+                     type='CrossEntropyLoss',
-                reduction='sum',
+                     use_sigmoid=True,
-                loss_weight=1.0),
+                     reduction='sum',
-            with_corner_loss=True,
+                     loss_weight=1.0),
-            init_cfg=None):
+                 with_corner_loss: bool = True,
+                 init_cfg: Optional[dict] = None) -> None:
        super(PointRCNNBboxHead, self).__init__(init_cfg=init_cfg)
        self.num_classes = num_classes
        self.num_sa = len(sa_channels)
@@ -169,7 +175,8 @@ class PointRCNNBboxHead(BaseModule):
        if init_cfg is None:
            self.init_cfg = dict(type='Xavier', layer=['Conv2d', 'Conv1d'])
-    def _add_conv_branch(self, in_channels, conv_channels):
+    def _add_conv_branch(self, in_channels: int,
+                         conv_channels: tuple) -> nn.Sequential:
        """Add shared or separable branch.
        Args:
@@ -203,7 +210,7 @@ class PointRCNNBboxHead(BaseModule):
                    nn.init.constant_(m.bias, 0)
        normal_init(self.conv_reg.weight, mean=0, std=0.001)
-    def forward(self, feats):
+    def forward(self, feats: Tensor) -> Tuple[Tensor]:
        """Forward pass.
        Args:
@@ -239,8 +246,10 @@ class PointRCNNBboxHead(BaseModule):
        rcnn_reg = rcnn_reg.transpose(1, 2).contiguous().squeeze(dim=1)
        return rcnn_cls, rcnn_reg
-    def loss(self, cls_score, bbox_pred, rois, labels, bbox_targets,
+    def loss(self, cls_score: Tensor, bbox_pred: Tensor, rois: Tensor,
-             pos_gt_bboxes, reg_mask, label_weights, bbox_weights):
+             labels: Tensor, bbox_targets: Tensor, pos_gt_bboxes: Tensor,
+             reg_mask: Tensor, label_weights: Tensor,
+             bbox_weights: Tensor) -> Dict:
        """Computing losses.
        Args:
@@ -302,15 +311,17 @@ class PointRCNNBboxHead(BaseModule):
            # calculate corner loss
            loss_corner = self.get_corner_loss_lidar(pred_boxes3d,
-                                                     pos_gt_bboxes)
+                                                     pos_gt_bboxes).mean()
            losses['loss_corner'] = loss_corner
        else:
-            losses['loss_corner'] = loss_cls.new_tensor(0)
+            losses['loss_corner'] = loss_cls.new_tensor(0) * loss_cls.sum()
        return losses
-    def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1.0):
+    def get_corner_loss_lidar(self,
+                              pred_bbox3d: Tensor,
+                              gt_bbox3d: Tensor,
+                              delta: float = 1.0) -> Tensor:
        """Calculate corner loss of given boxes.
        Args:
@@ -340,19 +351,24 @@ class PointRCNNBboxHead(BaseModule):
            torch.norm(pred_box_corners - gt_box_corners_flip, dim=2))
        # huber loss
        abs_error = corner_dist.abs()
-        quadratic = abs_error.clamp(max=delta)
+        # quadratic = abs_error.clamp(max=delta)
-        linear = (abs_error - quadratic)
+        # linear = (abs_error - quadratic)
-        corner_loss = 0.5 * quadratic**2 + delta * linear
+        # corner_loss = 0.5 * quadratic**2 + delta * linear
-        return corner_loss.mean(dim=1)
+        loss = torch.where(abs_error < delta, 0.5 * abs_error**2 / delta,
+                           abs_error - 0.5 * delta)
-    def get_targets(self, sampling_results, rcnn_train_cfg, concat=True):
+        return loss.mean(dim=1)
+    def get_targets(self,
+                    sampling_results: SamplingResultList,
+                    rcnn_train_cfg: dict,
+                    concat: bool = True) -> Tuple[Tensor]:
        """Generate targets.
        Args:
            sampling_results (list[:obj:`SamplingResult`]):
                Sampled results from rois.
            rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn.
-            concat (bool, optional): Whether to concatenate targets between
+            concat (bool): Whether to concatenate targets between
                batches. Defaults to True.
        Returns:
@@ -385,7 +401,8 @@ class PointRCNNBboxHead(BaseModule):
        return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
                bbox_weights)
-    def _get_target_single(self, pos_bboxes, pos_gt_bboxes, ious, cfg):
+    def _get_target_single(self, pos_bboxes: Tensor, pos_gt_bboxes: Tensor,
+                           ious: Tensor, cfg: dict) -> Tuple[Tensor]:
        """Generate training targets for a single sample.
        Args:
@@ -449,13 +466,13 @@ class PointRCNNBboxHead(BaseModule):
        return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
                bbox_weights)
-    def get_bboxes(self,
+    def get_results(self,
-                   rois,
+                    rois: Tensor,
-                   cls_score,
+                    cls_score: Tensor,
-                   bbox_pred,
+                    bbox_pred: Tensor,
-                   class_labels,
+                    class_labels: Tensor,
-                   img_metas,
+                    input_metas: List[dict],
-                   cfg=None):
+                    cfg: dict = None) -> InstanceList:
        """Generate bboxes from bbox head predictions.
        Args:
@@ -463,12 +480,22 @@ class PointRCNNBboxHead(BaseModule):
            cls_score (torch.Tensor): Scores of bounding boxes.
            bbox_pred (torch.Tensor): Bounding boxes predictions
            class_labels (torch.Tensor): Label of classes
-            img_metas (list[dict]): Point cloud and image's meta info.
+            input_metas (list[dict]): Point cloud and image's meta info.
            cfg (:obj:`ConfigDict`, optional): Testing config.
                Defaults to None.
        Returns:
-            list[tuple]: Decoded bbox, scores and labels after nms.
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
        """
        roi_batch_id = rois[..., 0]
        roi_boxes = rois[..., 1:]  # boxes without batch id
@@ -494,25 +521,27 @@ class PointRCNNBboxHead(BaseModule):
            cur_rcnn_boxes3d = rcnn_boxes3d[roi_batch_id == batch_id]
            keep = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d,
                                        cfg.score_thr, cfg.nms_thr,
-                                        img_metas[batch_id],
+                                        input_metas[batch_id],
                                        cfg.use_rotate_nms)
            selected_bboxes = cur_rcnn_boxes3d[keep]
            selected_label_preds = cur_class_labels[keep]
            selected_scores = cur_cls_score[keep]
+            results = InstanceData()
+            results.bboxes_3d = input_metas[batch_id]['box_type_3d'](
+                selected_bboxes, selected_bboxes.shape[-1])
+            results.scores_3d = selected_scores
+            results.labels_3d = selected_label_preds
-            result_list.append(
+            result_list.append(results)
-                (img_metas[batch_id]['box_type_3d'](selected_bboxes,
-                                                    self.bbox_coder.code_size),
-                 selected_scores, selected_label_preds))
        return result_list
    def multi_class_nms(self,
-                        box_probs,
+                        box_probs: Tensor,
-                        box_preds,
+                        box_preds: Tensor,
-                        score_thr,
+                        score_thr: float,
-                        nms_thr,
+                        nms_thr: float,
-                        input_meta,
+                        input_meta: dict,
-                        use_rotate_nms=True):
+                        use_rotate_nms: bool = True) -> Tensor:
        """Multi-class NMS for box head.
        Note:
@@ -527,7 +556,7 @@ class PointRCNNBboxHead(BaseModule):
            score_thr (float): Threshold of scores.
            nms_thr (float): Threshold for NMS.
            input_meta (dict): Meta information of the current sample.
-            use_rotate_nms (bool, optional): Whether to use rotated nms.
+            use_rotate_nms (bool): Whether to use rotated nms.
                Defaults to True.
        Returns:

--- a/mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py
+++ b/mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py
 # Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional, Tuple
 import torch
 from mmengine.model import BaseModule
+from torch import Tensor
 from torch import nn as nn
 from torch.nn import functional as F
-from mmdet3d.models.builder import build_loss
 from mmdet3d.registry import MODELS
-from mmdet3d.structures.bbox_3d import rotation_3d_in_axis
+from mmdet3d.structures.bbox_3d import BaseInstance3DBoxes, rotation_3d_in_axis
 from mmdet3d.utils import InstanceList
 from mmdet.models.utils import multi_apply
@@ -26,23 +28,23 @@ class PointwiseSemanticHead(BaseModule):
        loss_part (dict): Config of part prediction loss.
    """
-    def __init__(self,
+    def __init__(
-                 in_channels,
+        self,
-                 num_classes=3,
+        in_channels: int,
-                 extra_width=0.2,
+        num_classes: int = 3,
-                 seg_score_thr=0.3,
+        extra_width: float = 0.2,
-                 init_cfg=None,
+        seg_score_thr: float = 0.3,
-                 loss_seg=dict(
+        init_cfg: Optional[dict] = None,
-                     type='FocalLoss',
+        loss_seg: dict = dict(
-                     use_sigmoid=True,
+            type='FocalLoss',
-                     reduction='sum',
+            use_sigmoid=True,
-                     gamma=2.0,
+            reduction='sum',
-                     alpha=0.25,
+            gamma=2.0,
-                     loss_weight=1.0),
+            alpha=0.25,
-                 loss_part=dict(
+            loss_weight=1.0),
-                     type='CrossEntropyLoss',
+        loss_part: dict = dict(
-                     use_sigmoid=True,
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)
-                     loss_weight=1.0)):
+    ) -> None:
        super(PointwiseSemanticHead, self).__init__(init_cfg=init_cfg)
        self.extra_width = extra_width
        self.num_classes = num_classes
@@ -50,10 +52,10 @@ class PointwiseSemanticHead(BaseModule):
        self.seg_cls_layer = nn.Linear(in_channels, 1, bias=True)
        self.seg_reg_layer = nn.Linear(in_channels, 3, bias=True)
-        self.loss_seg = build_loss(loss_seg)
+        self.loss_seg = MODELS.build(loss_seg)
-        self.loss_part = build_loss(loss_part)
+        self.loss_part = MODELS.build(loss_part)
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Dict[str, Tensor]:
        """Forward pass.
        Args:
@@ -79,7 +81,9 @@ class PointwiseSemanticHead(BaseModule):
        return dict(
            seg_preds=seg_preds, part_preds=part_preds, part_feats=part_feats)
-    def get_targets_single(self, voxel_centers, gt_bboxes_3d, gt_labels_3d):
+    def get_targets_single(self, voxel_centers: Tensor,
+                           gt_bboxes_3d: BaseInstance3DBoxes,
+                           gt_labels_3d: Tensor) -> Tuple[Tensor]:
        """generate segmentation and part prediction targets for a single
        sample.
@@ -162,7 +166,8 @@ class PointwiseSemanticHead(BaseModule):
        part_targets = torch.cat(part_targets, dim=0)
        return dict(seg_targets=seg_targets, part_targets=part_targets)
-    def loss(self, semantic_results, semantic_targets):
+    def loss(self, semantic_results: dict,
+             semantic_targets: dict) -> Dict[str, Tensor]:
        """Calculate point-wise segmentation and part prediction losses.
        Args:

--- a/mmdet3d/models/roi_heads/mask_heads/primitive_head.py
+++ b/mmdet3d/models/roi_heads/mask_heads/primitive_head.py
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 import torch
 from mmcv.cnn import ConvModule
@@ -12,6 +12,7 @@ from torch.nn import functional as F
 from mmdet3d.models.layers import VoteModule, build_sa_module
 from mmdet3d.registry import MODELS
 from mmdet3d.structures import Det3DDataSample
+from mmdet3d.structures.bbox_3d import BaseInstance3DBoxes
 from mmdet.models.utils import multi_apply
@@ -26,39 +27,42 @@ class PrimitiveHead(BaseModule):
            available mode ['z', 'xy', 'line'].
        bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and
            decoding boxes.
-        train_cfg (dict): Config for training.
+        train_cfg (dict, optional): Config for training.
-        test_cfg (dict): Config for testing.
+        test_cfg (dict, optional): Config for testing.
-        vote_module_cfg (dict): Config of VoteModule for point-wise votes.
+        vote_module_cfg (dict, optional): Config of VoteModule for point-wise
-        vote_aggregation_cfg (dict): Config of vote aggregation layer.
+            votes.
+        vote_aggregation_cfg (dict, optional): Config of vote aggregation
+            layer.
        feat_channels (tuple[int]): Convolution channels of
            prediction layer.
        upper_thresh (float): Threshold for line matching.
        surface_thresh (float): Threshold for surface matching.
-        conv_cfg (dict): Config of convolution in prediction layer.
+        conv_cfg (dict, optional): Config of convolution in prediction layer.
-        norm_cfg (dict): Config of BN in prediction layer.
+        norm_cfg (dict, optional): Config of BN in prediction layer.
-        objectness_loss (dict): Config of objectness loss.
+        objectness_loss (dict, optional): Config of objectness loss.
-        center_loss (dict): Config of center loss.
+        center_loss (dict, optional): Config of center loss.
-        semantic_loss (dict): Config of point-wise semantic segmentation loss.
+        semantic_loss (dict, optional): Config of point-wise semantic
+            segmentation loss.
    """
    def __init__(self,
                 num_dims: int,
                 num_classes: int,
                 primitive_mode: str,
-                 train_cfg: dict = None,
+                 train_cfg: Optional[dict] = None,
-                 test_cfg: dict = None,
+                 test_cfg: Optional[dict] = None,
-                 vote_module_cfg: dict = None,
+                 vote_module_cfg: Optional[dict] = None,
-                 vote_aggregation_cfg: dict = None,
+                 vote_aggregation_cfg: Optional[dict] = None,
                 feat_channels: tuple = (128, 128),
                 upper_thresh: float = 100.0,
                 surface_thresh: float = 0.5,
                 conv_cfg: dict = dict(type='Conv1d'),
                 norm_cfg: dict = dict(type='BN1d'),
-                 objectness_loss: dict = None,
+                 objectness_loss: Optional[dict] = None,
-                 center_loss: dict = None,
+                 center_loss: Optional[dict] = None,
-                 semantic_reg_loss: dict = None,
+                 semantic_reg_loss: Optional[dict] = None,
-                 semantic_cls_loss: dict = None,
+                 semantic_cls_loss: Optional[dict] = None,
-                 init_cfg: dict = None):
+                 init_cfg: Optional[dict] = None):
        super(PrimitiveHead, self).__init__(init_cfg=init_cfg)
        # bounding box centers, face centers and edge centers
        assert primitive_mode in ['z', 'xy', 'line']
@@ -126,7 +130,7 @@ class PrimitiveHead(BaseModule):
        assert sample_mode in ['vote', 'seed', 'random']
        return sample_mode
-    def forward(self, feats_dict):
+    def forward(self, feats_dict: dict) -> dict:
        """Forward pass.
        Args:
@@ -255,10 +259,8 @@ class PrimitiveHead(BaseModule):
                attributes.
            batch_pts_semantic_mask (list[tensor]): Semantic mask
                of point cloud. Defaults to None.
-            batch_pts_semantic_mask (list[tensor]): Instance mask
+            batch_pts_instance_mask (list[tensor]): Instance mask
                of point cloud. Defaults to None.
-            batch_input_metas (list[dict]): Contain pcd and img's meta info.
-            ret_target (bool): Return targets or not. Defaults to False.
        Returns:
            dict: Losses of Primitive Head.
@@ -392,12 +394,13 @@ class PrimitiveHead(BaseModule):
        return (point_mask, point_offset, gt_primitive_center,
                gt_primitive_semantic, gt_sem_cls_label, gt_votes_mask)
-    def get_targets_single(self,
+    def get_targets_single(
-                           points,
+            self,
-                           gt_bboxes_3d,
+            points: torch.Tensor,
-                           gt_labels_3d,
+            gt_bboxes_3d: BaseInstance3DBoxes,
-                           pts_semantic_mask=None,
+            gt_labels_3d: torch.Tensor,
-                           pts_instance_mask=None):
+            pts_semantic_mask: torch.Tensor = None,
+            pts_instance_mask: torch.Tensor = None) -> Tuple[torch.Tensor]:
        """Generate targets of primitive head for single batch.
        Args:
@@ -668,7 +671,8 @@ class PrimitiveHead(BaseModule):
        return (point_mask, point_sem, point_offset)
-    def primitive_decode_scores(self, predictions, aggregated_points):
+    def primitive_decode_scores(self, predictions: torch.Tensor,
+                                aggregated_points: torch.Tensor) -> dict:
        """Decode predicted parts to primitive head.
        Args:
@@ -696,7 +700,7 @@ class PrimitiveHead(BaseModule):
        return ret_dict
-    def check_horizon(self, points):
+    def check_horizon(self, points: torch.Tensor) -> bool:
        """Check whether is a horizontal plane.
        Args:
@@ -709,7 +713,8 @@ class PrimitiveHead(BaseModule):
               (points[1][-1] == points[2][-1]) and \
               (points[2][-1] == points[3][-1])
-    def check_dist(self, plane_equ, points):
+    def check_dist(self, plane_equ: torch.Tensor,
+                   points: torch.Tensor) -> tuple:
        """Whether the mean of points to plane distance is lower than thresh.
        Args:
@@ -722,7 +727,8 @@ class PrimitiveHead(BaseModule):
        return (points[:, 2] +
                plane_equ[-1]).sum() / 4.0 < self.train_cfg['lower_thresh']
-    def point2line_dist(self, points, pts_a, pts_b):
+    def point2line_dist(self, points: torch.Tensor, pts_a: torch.Tensor,
+                        pts_b: torch.Tensor) -> torch.Tensor:
        """Calculate the distance from point to line.
        Args:
@@ -741,7 +747,11 @@ class PrimitiveHead(BaseModule):
        return dist
-    def match_point2line(self, points, corners, with_yaw, mode='bottom'):
+    def match_point2line(self,
+                         points: torch.Tensor,
+                         corners: torch.Tensor,
+                         with_yaw: bool,
+                         mode: str = 'bottom') -> tuple:
        """Match points to corresponding line.
        Args:
@@ -782,7 +792,8 @@ class PrimitiveHead(BaseModule):
            selected_list = [sel1, sel2, sel3, sel4]
        return selected_list
-    def match_point2plane(self, plane, points):
+    def match_point2plane(self, plane: torch.Tensor,
+                          points: torch.Tensor) -> tuple:
        """Match points to plane.
        Args:
@@ -800,10 +811,14 @@ class PrimitiveHead(BaseModule):
                             min_dist) < self.train_cfg['dist_thresh']
        return point2plane_dist, selected
-    def compute_primitive_loss(self, primitive_center, primitive_semantic,
+    def compute_primitive_loss(self, primitive_center: torch.Tensor,
-                               semantic_scores, num_proposal,
+                               primitive_semantic: torch.Tensor,
-                               gt_primitive_center, gt_primitive_semantic,
+                               semantic_scores: torch.Tensor,
-                               gt_sem_cls_label, gt_primitive_mask):
+                               num_proposal: torch.Tensor,
+                               gt_primitive_center: torch.Tensor,
+                               gt_primitive_semantic: torch.Tensor,
+                               gt_sem_cls_label: torch.Tensor,
+                               gt_primitive_mask: torch.Tensor) -> Tuple:
        """Compute loss of primitive module.
        Args:
@@ -849,7 +864,8 @@ class PrimitiveHead(BaseModule):
        return center_loss, size_loss, sem_cls_loss
-    def get_primitive_center(self, pred_flag, center):
+    def get_primitive_center(self, pred_flag: torch.Tensor,
+                             center: torch.Tensor) -> Tuple:
        """Generate primitive center from predictions.
        Args:
@@ -869,17 +885,17 @@ class PrimitiveHead(BaseModule):
        return center, pred_indices
    def _assign_primitive_line_targets(self,
-                                       point_mask,
+                                       point_mask: torch.Tensor,
-                                       point_offset,
+                                       point_offset: torch.Tensor,
-                                       point_sem,
+                                       point_sem: torch.Tensor,
-                                       coords,
+                                       coords: torch.Tensor,
-                                       indices,
+                                       indices: torch.Tensor,
-                                       cls_label,
+                                       cls_label: int,
-                                       point2line_matching,
+                                       point2line_matching: torch.Tensor,
-                                       corners,
+                                       corners: torch.Tensor,
-                                       center_axises,
+                                       center_axises: torch.Tensor,
-                                       with_yaw,
+                                       with_yaw: bool,
-                                       mode='bottom'):
+                                       mode: str = 'bottom') -> Tuple:
        """Generate targets of line primitive.
        Args:
@@ -934,15 +950,15 @@ class PrimitiveHead(BaseModule):
        return point_mask, point_offset, point_sem
    def _assign_primitive_surface_targets(self,
-                                          point_mask,
+                                          point_mask: torch.Tensor,
-                                          point_offset,
+                                          point_offset: torch.Tensor,
-                                          point_sem,
+                                          point_sem: torch.Tensor,
-                                          coords,
+                                          coords: torch.Tensor,
-                                          indices,
+                                          indices: torch.Tensor,
-                                          cls_label,
+                                          cls_label: int,
-                                          corners,
+                                          corners: torch.Tensor,
-                                          with_yaw,
+                                          with_yaw: bool,
-                                          mode='bottom'):
+                                          mode: str = 'bottom') -> Tuple:
        """Generate targets for primitive z and primitive xy.
        Args:
@@ -1017,7 +1033,9 @@ class PrimitiveHead(BaseModule):
        point_offset[indices] = center - coords
        return point_mask, point_offset, point_sem
-    def _get_plane_fomulation(self, vector1, vector2, point):
+    def _get_plane_fomulation(self, vector1: torch.Tensor,
+                              vector2: torch.Tensor,
+                              point: torch.Tensor) -> torch.Tensor:
        """Compute the equation of the plane.
        Args:

--- a/mmdet3d/models/roi_heads/part_aggregation_roi_head.py
+++ b/mmdet3d/models/roi_heads/part_aggregation_roi_head.py
@@ -90,16 +90,18 @@ class PartAggregationROIHead(Base3DRoIHead):
        return bbox_results
    def _assign_and_sample(
-            self, proposal_list: InstanceList,
+            self, rpn_results_list: InstanceList,
-            batch_gt_instances_3d: InstanceList) -> List[SamplingResult]:
+            batch_gt_instances_3d: InstanceList,
+            batch_gt_instances_ignore: InstanceList) -> List[SamplingResult]:
        """Assign and sample proposals for training.
        Args:
-            proposal_list (list[:obj:`InstancesData`]): Proposals produced by
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
-                rpn head.
+                of rpn head.
            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
                gt_instances. It usually includes ``bboxes_3d`` and
                ``labels_3d`` attributes.
+            batch_gt_instances_ignore (list): Ignore instances of gt bboxes.
        Returns:
            list[:obj:`SamplingResult`]: Sampled results of each training
@@ -107,16 +109,16 @@ class PartAggregationROIHead(Base3DRoIHead):
        """
        sampling_results = []
        # bbox assign
-        for batch_idx in range(len(proposal_list)):
+        for batch_idx in range(len(rpn_results_list)):
-            cur_proposal_list = proposal_list[batch_idx]
+            cur_proposal_list = rpn_results_list[batch_idx]
            cur_boxes = cur_proposal_list['bboxes_3d']
            cur_labels_3d = cur_proposal_list['labels_3d']
            cur_gt_instances_3d = batch_gt_instances_3d[batch_idx]
+            cur_gt_instances_ignore = batch_gt_instances_ignore[batch_idx]
            cur_gt_instances_3d.bboxes_3d = cur_gt_instances_3d.\
                bboxes_3d.tensor
-            cur_gt_bboxes = batch_gt_instances_3d[batch_idx].bboxes_3d.to(
+            cur_gt_bboxes = cur_gt_instances_3d.bboxes_3d.to(cur_boxes.device)
-                cur_boxes.device)
+            cur_gt_labels = cur_gt_instances_3d.labels_3d
-            cur_gt_labels = batch_gt_instances_3d[batch_idx].labels_3d
            batch_num_gts = 0
            # 0 is bg
@@ -132,7 +134,8 @@ class PartAggregationROIHead(Base3DRoIHead):
                    pred_per_cls = (cur_labels_3d == i)
                    cur_assign_res = assigner.assign(
                        cur_proposal_list[pred_per_cls],
-                        cur_gt_instances_3d[gt_per_cls])
+                        cur_gt_instances_3d[gt_per_cls],
+                        cur_gt_instances_ignore)
                    # gather assign_results in different class into one result
                    batch_num_gts += cur_assign_res.num_gts
                    # gt inds (1-based)
@@ -158,7 +161,8 @@ class PartAggregationROIHead(Base3DRoIHead):
                                             batch_gt_labels)
            else:  # for single class
                assign_result = self.bbox_assigner.assign(
-                    cur_proposal_list, cur_gt_instances_3d)
+                    cur_proposal_list, cur_gt_instances_3d,
+                    cur_gt_instances_ignore)
            # sample boxes
            sampling_result = self.bbox_sampler.sample(assign_result,
                                                       cur_boxes.tensor,
@@ -200,7 +204,7 @@ class PartAggregationROIHead(Base3DRoIHead):
        Args:
            feats_dict (dict): Contains features from the first stage.
-            rpn_results_list (List[:obj:`InstancesData`]): Detection results
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
                of rpn head.
            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
                samples. It usually includes information such as
@@ -247,7 +251,7 @@ class PartAggregationROIHead(Base3DRoIHead):
            voxel_dict (dict): Contains information of voxels.
            batch_input_metas (list[dict], Optional): Batch image meta info.
                Defaults to None.
-            rpn_results_list (List[:obj:`InstancesData`]): Detection results
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
                of rpn head.
            test_cfg (Config): Test config.
@@ -316,7 +320,7 @@ class PartAggregationROIHead(Base3DRoIHead):
        Args:
            feats_dict (dict): Contains features from the first stage.
-            rpn_results_list (List[:obj:`InstancesData`]): Detection results
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
                of rpn head.
            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
                samples. It usually includes information such as
@@ -342,7 +346,8 @@ class PartAggregationROIHead(Base3DRoIHead):
            losses.update(semantic_results.pop('loss_semantic'))
        sample_results = self._assign_and_sample(rpn_results_list,
-                                                 batch_gt_instances_3d)
+                                                 batch_gt_instances_3d,
+                                                 batch_gt_instances_ignore)
        if self.with_bbox:
            feats_dict.update(semantic_results)
            bbox_results = self._bbox_forward_train(feats_dict, voxels_dict,
@@ -358,7 +363,7 @@ class PartAggregationROIHead(Base3DRoIHead):
        Args:
            feats_dict (dict): Contains features from the first stage.
-            rpn_results_list (List[:obj:`InstancesData`]): Detection results
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
                of rpn head.
        Returns:

--- a/mmdet3d/models/roi_heads/point_rcnn_roi_head.py
+++ b/mmdet3d/models/roi_heads/point_rcnn_roi_head.py
 # Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional
 import torch
+from torch import Tensor
 from torch.nn import functional as F
 from mmdet3d.registry import MODELS, TASK_UTILS
-from mmdet3d.structures import bbox3d2result, bbox3d2roi
+from mmdet3d.structures import bbox3d2roi
+from mmdet3d.utils.typing import InstanceList, SampleList
 from mmdet.models.task_modules import AssignResult
 from .base_3droi_head import Base3DRoIHead
@@ -14,43 +18,31 @@ class PointRCNNRoIHead(Base3DRoIHead):
    Args:
        bbox_head (dict): Config of bbox_head.
-        point_roi_extractor (dict): Config of RoI extractor.
+        bbox_roi_extractor (dict): Config of RoI extractor.
        train_cfg (dict): Train configs.
        test_cfg (dict): Test configs.
-        depth_normalizer (float, optional): Normalize depth feature.
+        depth_normalizer (float): Normalize depth feature.
            Defaults to 70.0.
        init_cfg (dict, optional): Config of initialization. Defaults to None.
    """
    def __init__(self,
-                 bbox_head,
+                 bbox_head: dict,
-                 point_roi_extractor,
+                 bbox_roi_extractor: dict,
-                 train_cfg,
+                 train_cfg: dict,
-                 test_cfg,
+                 test_cfg: dict,
-                 depth_normalizer=70.0,
+                 depth_normalizer: dict = 70.0,
-                 pretrained=None,
+                 init_cfg: Optional[dict] = None) -> None:
-                 init_cfg=None):
        super(PointRCNNRoIHead, self).__init__(
            bbox_head=bbox_head,
+            bbox_roi_extractor=bbox_roi_extractor,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
-            pretrained=pretrained,
            init_cfg=init_cfg)
        self.depth_normalizer = depth_normalizer
-        if point_roi_extractor is not None:
-            self.point_roi_extractor = MODELS.build(point_roi_extractor)
        self.init_assigner_sampler()
-    def init_bbox_head(self, bbox_head):
-        """Initialize box head.
-        Args:
-            bbox_head (dict): Config dict of RoI Head.
-        """
-        self.bbox_head = MODELS.build(bbox_head)
    def init_mask_head(self):
        """Initialize mask head."""
        # Deliberate no-op: nothing is built for the mask head here.
        pass
@@ -68,77 +60,101 @@ class PointRCNNRoIHead(Base3DRoIHead):
                ]
            self.bbox_sampler = TASK_UTILS.build(self.train_cfg.sampler)
-    def forward_train(self, feats_dict, input_metas, proposal_list,
+    def loss(self, feats_dict: Dict, rpn_results_list: InstanceList,
-                      gt_bboxes_3d, gt_labels_3d):
+             batch_data_samples: SampleList, **kwargs) -> dict:
-        """Training forward function of PointRCNNRoIHead.
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
        Args:
            feats_dict (dict): Contains features from the first stage.
-            imput_metas (list[dict]): Meta info of each input.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
-            proposal_list (list[dict]): Proposal information from rpn.
+                of rpn head.
-                The dictionary should contain the following keys:
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
-                - boxes_3d (:obj:`BaseInstance3DBoxes`): Proposal bboxes
+                `gt_instances_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
-                - labels_3d (torch.Tensor): Labels of proposals
-            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]):
-                GT bboxes of each sample. The bboxes are encapsulated
-                by 3D box bboxes_3d.
-            gt_labels_3d (list[LongTensor]): GT labels of each sample.
        Returns:
-            dict: Losses from RoI RCNN head.
+            dict[str, Tensor]: A dictionary of loss components
-                - loss_bbox (torch.Tensor): Loss of bboxes
        """
-        features = feats_dict['features']
+        features = feats_dict['fp_features']
-        points = feats_dict['points']
+        fp_points = feats_dict['fp_points']
        point_cls_preds = feats_dict['points_cls_preds']
        sem_scores = point_cls_preds.sigmoid()
        point_scores = sem_scores.max(-1)[0]
+        batch_gt_instances_3d = []
-        sample_results = self._assign_and_sample(proposal_list, gt_bboxes_3d,
+        batch_gt_instances_ignore = []
-                                                 gt_labels_3d)
+        for data_sample in batch_data_samples:
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            if 'ignored_instances' in data_sample:
+                batch_gt_instances_ignore.append(data_sample.ignored_instances)
+            else:
+                batch_gt_instances_ignore.append(None)
+        sample_results = self._assign_and_sample(rpn_results_list,
+                                                 batch_gt_instances_3d,
+                                                 batch_gt_instances_ignore)
        # concat the depth, semantic features and backbone features
        features = features.transpose(1, 2).contiguous()
-        point_depths = points.norm(dim=2) / self.depth_normalizer - 0.5
+        point_depths = fp_points.norm(dim=2) / self.depth_normalizer - 0.5
        features_list = [
            point_scores.unsqueeze(2),
            point_depths.unsqueeze(2), features
        ]
        features = torch.cat(features_list, dim=2)
-        bbox_results = self._bbox_forward_train(features, points,
+        bbox_results = self._bbox_forward_train(features, fp_points,
                                                sample_results)
        losses = dict()
        losses.update(bbox_results['loss_bbox'])
        return losses
-    def simple_test(self, feats_dict, img_metas, proposal_list, **kwargs):
+    def predict(self,
-        """Simple testing forward function of PointRCNNRoIHead.
+                feats_dict: Dict,
+                rpn_results_list: InstanceList,
-        Note:
+                batch_data_samples: SampleList,
-            This function assumes that the batch size is 1
+                rescale: bool = False,
+                **kwargs) -> InstanceList:
+        """Perform forward propagation of the roi head and predict detection
+        results on the features of the upstream network.
        Args:
            feats_dict (dict): Contains features from the first stage.
-            img_metas (list[dict]): Meta info of each image.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
-            proposal_list (list[dict]): Proposal information from rpn.
+                of rpn head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instances_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
        Returns:
-            dict: Bbox results of one frame.
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
        """
-        rois = bbox3d2roi([res['boxes_3d'].tensor for res in proposal_list])
+        rois = bbox3d2roi(
-        labels_3d = [res['labels_3d'] for res in proposal_list]
+            [res['bboxes_3d'].tensor for res in rpn_results_list])
+        labels_3d = [res['labels_3d'] for res in rpn_results_list]
-        features = feats_dict['features']
+        batch_input_metas = [
-        points = feats_dict['points']
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        fp_features = feats_dict['fp_features']
+        fp_points = feats_dict['fp_points']
        point_cls_preds = feats_dict['points_cls_preds']
        sem_scores = point_cls_preds.sigmoid()
        point_scores = sem_scores.max(-1)[0]
-        features = features.transpose(1, 2).contiguous()
+        features = fp_features.transpose(1, 2).contiguous()
-        point_depths = points.norm(dim=2) / self.depth_normalizer - 0.5
+        point_depths = fp_points.norm(dim=2) / self.depth_normalizer - 0.5
        features_list = [
            point_scores.unsqueeze(2),
            point_depths.unsqueeze(2), features
@@ -146,29 +162,27 @@ class PointRCNNRoIHead(Base3DRoIHead):
        features = torch.cat(features_list, dim=2)
        batch_size = features.shape[0]
-        bbox_results = self._bbox_forward(features, points, batch_size, rois)
+        bbox_results = self._bbox_forward(features, fp_points, batch_size,
+                                          rois)
        object_score = bbox_results['cls_score'].sigmoid()
-        bbox_list = self.bbox_head.get_bboxes(
+        bbox_list = self.bbox_head.get_results(
            rois,
            object_score,
            bbox_results['bbox_pred'],
            labels_3d,
-            img_metas,
+            batch_input_metas,
            cfg=self.test_cfg)
-        bbox_results = [
+        return bbox_list
-            bbox3d2result(bboxes, scores, labels)
-            for bboxes, scores, labels in bbox_list
-        ]
-        return bbox_results
-    def _bbox_forward_train(self, features, points, sampling_results):
+    def _bbox_forward_train(self, features: Tensor, points: Tensor,
+                            sampling_results: SampleList) -> dict:
        """Forward training function of roi_extractor and bbox_head.
        Args:
            features (torch.Tensor): Backbone features with depth and
                semantic features.
-            points (torch.Tensor): Pointcloud.
+            points (torch.Tensor): Point cloud.
            sampling_results (:obj:`SamplingResult`): Sampled results used
                for training.
@@ -188,14 +202,15 @@ class PointRCNNRoIHead(Base3DRoIHead):
        bbox_results.update(loss_bbox=loss_bbox)
        return bbox_results
-    def _bbox_forward(self, features, points, batch_size, rois):
+    def _bbox_forward(self, features: Tensor, points: Tensor, batch_size: int,
+                      rois: Tensor) -> dict:
        """Forward function of roi_extractor and bbox_head used in both
        training and testing.
        Args:
            features (torch.Tensor): Backbone features with depth and
                semantic features.
-            points (torch.Tensor): Pointcloud.
+            points (torch.Tensor): Point cloud.
            batch_size (int): Batch size.
            rois (torch.Tensor): RoI boxes.
@@ -203,21 +218,27 @@ class PointRCNNRoIHead(Base3DRoIHead):
            dict: Contains predictions of bbox_head and
                features of roi_extractor.
        """
-        pooled_point_feats = self.point_roi_extractor(features, points,
+        pooled_point_feats = self.bbox_roi_extractor(features, points,
-                                                      batch_size, rois)
+                                                     batch_size, rois)
        cls_score, bbox_pred = self.bbox_head(pooled_point_feats)
        bbox_results = dict(cls_score=cls_score, bbox_pred=bbox_pred)
        return bbox_results
-    def _assign_and_sample(self, proposal_list, gt_bboxes_3d, gt_labels_3d):
+    def _assign_and_sample(
+            self, rpn_results_list: InstanceList,
+            batch_gt_instances_3d: InstanceList,
+            batch_gt_instances_ignore: InstanceList) -> SampleList:
        """Assign and sample proposals for training.
        Args:
-            proposal_list (list[dict]): Proposals produced by RPN.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
-            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
+                of rpn head.
-                boxes.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
-            gt_labels_3d (list[torch.Tensor]): Ground truth labels
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`]): Ignore
+                instances of gt bboxes.
        Returns:
            list[:obj:`SamplingResult`]: Sampled results of each training
@@ -225,12 +246,16 @@ class PointRCNNRoIHead(Base3DRoIHead):
        """
        sampling_results = []
        # bbox assign
-        for batch_idx in range(len(proposal_list)):
+        for batch_idx in range(len(rpn_results_list)):
-            cur_proposal_list = proposal_list[batch_idx]
+            cur_proposal_list = rpn_results_list[batch_idx]
-            cur_boxes = cur_proposal_list['boxes_3d']
+            cur_boxes = cur_proposal_list['bboxes_3d']
            cur_labels_3d = cur_proposal_list['labels_3d']
-            cur_gt_bboxes = gt_bboxes_3d[batch_idx].to(cur_boxes.device)
+            cur_gt_instances_3d = batch_gt_instances_3d[batch_idx]
-            cur_gt_labels = gt_labels_3d[batch_idx]
+            cur_gt_instances_3d.bboxes_3d = cur_gt_instances_3d.\
+                bboxes_3d.tensor
+            cur_gt_instances_ignore = batch_gt_instances_ignore[batch_idx]
+            cur_gt_bboxes = cur_gt_instances_3d.bboxes_3d.to(cur_boxes.device)
+            cur_gt_labels = cur_gt_instances_3d.labels_3d
            batch_num_gts = 0
            # 0 is bg
            batch_gt_indis = cur_gt_labels.new_full((len(cur_boxes), ), 0)
@@ -244,9 +269,9 @@ class PointRCNNRoIHead(Base3DRoIHead):
                    gt_per_cls = (cur_gt_labels == i)
                    pred_per_cls = (cur_labels_3d == i)
                    cur_assign_res = assigner.assign(
-                        cur_boxes.tensor[pred_per_cls],
+                        cur_proposal_list[pred_per_cls],
-                        cur_gt_bboxes.tensor[gt_per_cls],
+                        cur_gt_instances_3d[gt_per_cls],
-                        gt_labels=cur_gt_labels[gt_per_cls])
+                        cur_gt_instances_ignore)
                    # gather assign_results in different class into one result
                    batch_num_gts += cur_assign_res.num_gts
                    # gt inds (1-based)
@@ -272,14 +297,13 @@ class PointRCNNRoIHead(Base3DRoIHead):
                                             batch_gt_labels)
            else:  # for single class
                assign_result = self.bbox_assigner.assign(
-                    cur_boxes.tensor,
+                    cur_proposal_list, cur_gt_instances_3d,
-                    cur_gt_bboxes.tensor,
+                    cur_gt_instances_ignore)
-                    gt_labels=cur_gt_labels)
            # sample boxes
            sampling_result = self.bbox_sampler.sample(assign_result,
                                                       cur_boxes.tensor,
-                                                       cur_gt_bboxes.tensor,
+                                                       cur_gt_bboxes,
                                                       cur_gt_labels)
            sampling_results.append(sampling_result)
        return sampling_results
--- a/mmdet3d/models/roi_heads/roi_extractors/single_roiaware_extractor.py
+++ b/mmdet3d/models/roi_heads/roi_extractors/single_roiaware_extractor.py
 # Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
 import torch
+import torch.nn as nn
 from mmcv import ops
 from mmengine.model import BaseModule
+from torch import Tensor
 from mmdet3d.registry import MODELS
@@ -13,14 +17,16 @@ class Single3DRoIAwareExtractor(BaseModule):
    Extract Point-wise roi features.
    Args:
-        roi_layer (dict): The config of roi layer.
+        roi_layer (dict, optional): The config of roi layer.
    """
-    def __init__(self, roi_layer=None, init_cfg=None):
+    def __init__(self,
+                 roi_layer: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None) -> None:
        super(Single3DRoIAwareExtractor, self).__init__(init_cfg=init_cfg)
        self.roi_layer = self.build_roi_layers(roi_layer)
-    def build_roi_layers(self, layer_cfg):
+    def build_roi_layers(self, layer_cfg: dict) -> nn.Module:
        """Build roi layers using `layer_cfg`"""
        cfg = layer_cfg.copy()
        layer_type = cfg.pop('type')
@@ -29,7 +35,8 @@ class Single3DRoIAwareExtractor(BaseModule):
        roi_layers = layer_cls(**cfg)
        return roi_layers
-    def forward(self, feats, coordinate, batch_inds, rois):
+    def forward(self, feats: Tensor, coordinate: Tensor, batch_inds: Tensor,
+                rois: Tensor) -> Tensor:
        """Extract point-wise roi features.
        Args:

--- a/mmdet3d/models/roi_heads/roi_extractors/single_roipoint_extractor.py
+++ b/mmdet3d/models/roi_heads/roi_extractors/single_roipoint_extractor.py
 # Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
 import torch
+import torch.nn as nn
 from mmcv import ops
-from torch import nn as nn
+from torch import Tensor
 from mmdet3d.registry import MODELS
 from mmdet3d.structures.bbox_3d import rotation_3d_in_axis
@@ -14,14 +17,14 @@ class Single3DRoIPointExtractor(nn.Module):
    Extract Point-wise roi features.
    Args:
-        roi_layer (dict): The config of roi layer.
+        roi_layer (dict, optional): The config of roi layer.
    """
-    def __init__(self, roi_layer=None):
+    def __init__(self, roi_layer: Optional[dict] = None) -> None:
        super(Single3DRoIPointExtractor, self).__init__()
        self.roi_layer = self.build_roi_layers(roi_layer)
-    def build_roi_layers(self, layer_cfg):
+    def build_roi_layers(self, layer_cfg: dict) -> nn.Module:
        """Build roi layers using `layer_cfg`"""
        cfg = layer_cfg.copy()
        layer_type = cfg.pop('type')
@@ -30,7 +33,8 @@ class Single3DRoIPointExtractor(nn.Module):
        roi_layers = layer_cls(**cfg)
        return roi_layers
-    def forward(self, feats, coordinate, batch_inds, rois):
+    def forward(self, feats: Tensor, coordinate: Tensor, batch_inds: Tensor,
+                rois: Tensor) -> Tensor:
        """Extract point-wise roi features.
        Args:

--- a/mmdet3d/structures/bbox_3d/box_3d_mode.py
+++ b/mmdet3d/structures/bbox_3d/box_3d_mode.py
@@ -41,7 +41,7 @@ class Box3DMode(IntEnum):
             v
        down y
-    The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5],
+    The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5),
    and the yaw is around the y axis, thus the rotation axis=1.
    Coordinates in Depth mode:
@@ -63,7 +63,7 @@ class Box3DMode(IntEnum):
    DEPTH = 2
    @staticmethod
-    def convert(box, src, dst, rt_mat=None, with_yaw=True):
+    def convert(box, src, dst, rt_mat=None, with_yaw=True, correct_yaw=False):
        """Convert boxes from `src` mode to `dst` mode.
        Args:
@@ -81,6 +81,7 @@ class Box3DMode(IntEnum):
            with_yaw (bool, optional): If `box` is an instance of
                :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle.
                Defaults to True.
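+            correct_yaw (bool, optional): Whether to convert the yaw by
+                rotating its direction vector with `rt_mat` instead of
+                applying a fixed angle offset. Defaults to False.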
        Returns:
            (tuple | list | np.ndarray | torch.Tensor |
@@ -119,41 +120,89 @@ class Box3DMode(IntEnum):
                rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
            if with_yaw:
-                yaw = -yaw - np.pi / 2
+                if correct_yaw:
-                yaw = limit_period(yaw, period=np.pi * 2)
+                    yaw_vector = torch.cat([
+                        torch.cos(yaw),
+                        torch.sin(yaw),
+                        torch.zeros_like(yaw)
+                    ],
+                                           dim=1)
+                else:
+                    yaw = -yaw - np.pi / 2
+                    yaw = limit_period(yaw, period=np.pi * 2)
        elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
            if with_yaw:
-                yaw = -yaw - np.pi / 2
+                if correct_yaw:
-                yaw = limit_period(yaw, period=np.pi * 2)
+                    yaw_vector = torch.cat([
+                        torch.cos(-yaw),
+                        torch.zeros_like(yaw),
+                        torch.sin(-yaw)
+                    ],
+                                           dim=1)
+                else:
+                    yaw = -yaw - np.pi / 2
+                    yaw = limit_period(yaw, period=np.pi * 2)
        elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
            if with_yaw:
-                yaw = -yaw
+                if correct_yaw:
+                    yaw_vector = torch.cat([
+                        torch.cos(yaw),
+                        torch.sin(yaw),
+                        torch.zeros_like(yaw)
+                    ],
+                                           dim=1)
+                else:
+                    yaw = -yaw
        elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
            if with_yaw:
-                yaw = -yaw
+                if correct_yaw:
+                    yaw_vector = torch.cat([
+                        torch.cos(-yaw),
+                        torch.zeros_like(yaw),
+                        torch.sin(-yaw)
+                    ],
+                                           dim=1)
+                else:
+                    yaw = -yaw
        elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
            xyz_size = torch.cat([x_size, y_size, z_size], dim=-1)
            if with_yaw:
-                yaw = yaw + np.pi / 2
+                if correct_yaw:
-                yaw = limit_period(yaw, period=np.pi * 2)
+                    yaw_vector = torch.cat([
+                        torch.cos(yaw),
+                        torch.sin(yaw),
+                        torch.zeros_like(yaw)
+                    ],
+                                           dim=1)
+                else:
+                    yaw = yaw + np.pi / 2
+                    yaw = limit_period(yaw, period=np.pi * 2)
        elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
            xyz_size = torch.cat([x_size, y_size, z_size], dim=-1)
            if with_yaw:
-                yaw = yaw - np.pi / 2
+                if correct_yaw:
-                yaw = limit_period(yaw, period=np.pi * 2)
+                    yaw_vector = torch.cat([
+                        torch.cos(yaw),
+                        torch.sin(yaw),
+                        torch.zeros_like(yaw)
+                    ],
+                                           dim=1)
+                else:
+                    yaw = yaw - np.pi / 2
+                    yaw = limit_period(yaw, period=np.pi * 2)
        else:
            raise NotImplementedError(
                f'Conversion from Box3DMode {src} to {dst} '
@@ -168,6 +217,18 @@ class Box3DMode(IntEnum):
        else:
            xyz = arr[..., :3] @ rt_mat.t()
+        # Note: only the rotation part of rt_mat (rt_mat[:3, :3]) is applied
+        # to yaw_vector, so yaw_vector does not need to be extended.
+        if with_yaw and correct_yaw:
+            rot_yaw_vector = yaw_vector @ rt_mat[:3, :3].t()
+            if dst == Box3DMode.CAM:
+                yaw = torch.atan2(-rot_yaw_vector[:, [2]], rot_yaw_vector[:,
+                                                                          [0]])
+            elif dst in [Box3DMode.LIDAR, Box3DMode.DEPTH]:
+                yaw = torch.atan2(rot_yaw_vector[:, [1]], rot_yaw_vector[:,
+                                                                         [0]])
+            yaw = limit_period(yaw, period=np.pi * 2)
        if with_yaw:
            remains = arr[..., 7:]
            arr = torch.cat([xyz[..., :3], xyz_size, yaw, remains], dim=-1)