Unverified Commit 32a4328b authored by Wenwei Zhang, committed by GitHub

Bump version to V1.0.0rc0

parents 86cc487c a8817998
......@@ -16,7 +16,7 @@ class NoStemRegNet(RegNet):
- wm (float): Quantization parameter to quantize the width.
- depth (int): Depth of the backbone.
- group_w (int): Width of group.
-        - bot_mul (float): Bottleneck ratio, i.e. expansion of bottlneck.
+        - bot_mul (float): Bottleneck ratio, i.e. expansion of bottleneck.
strides (Sequence[int]): Strides of the first block of each stage.
base_channels (int): Base channels after stem layer.
in_channels (int): Number of input image channels. Normally 3.
......
......@@ -64,7 +64,11 @@ class PointNet2SAMSG(BasePointNet):
self.out_indices = out_indices
assert max(out_indices) < self.num_sa
assert len(num_points) == len(radii) == len(num_samples) == len(
-            sa_channels) == len(aggregation_channels)
+            sa_channels)
+        if aggregation_channels is not None:
+            assert len(sa_channels) == len(aggregation_channels)
+        else:
+            aggregation_channels = [None] * len(sa_channels)
self.SA_modules = nn.ModuleList()
self.aggregation_mlps = nn.ModuleList()
......@@ -134,7 +138,7 @@ class PointNet2SAMSG(BasePointNet):
- sa_xyz (torch.Tensor): The coordinates of sa features.
- sa_features (torch.Tensor): The features from the
last Set Aggregation Layers.
-            - sa_indices (torch.Tensor): Indices of the \
+            - sa_indices (torch.Tensor): Indices of the
input points.
"""
xyz, features = self._split_point_feats(points)
......
......@@ -97,11 +97,11 @@ class PointNet2SASSG(BasePointNet):
Returns:
dict[str, list[torch.Tensor]]: Outputs after SA and FP modules.
-            - fp_xyz (list[torch.Tensor]): The coordinates of \
+            - fp_xyz (list[torch.Tensor]): The coordinates of
                each fp features.
-            - fp_features (list[torch.Tensor]): The features \
+            - fp_features (list[torch.Tensor]): The features
                from each Feature Propagate Layers.
-            - fp_indices (list[torch.Tensor]): Indices of the \
+            - fp_indices (list[torch.Tensor]): Indices of the
input points.
"""
xyz, features = self._split_point_feats(points)
......
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from mmcv.cnn import build_conv_layer, build_norm_layer
from mmcv.runner import BaseModule
from torch import nn as nn
......
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from mmcv.cnn import MODELS as MMCV_MODELS
from mmcv.utils import Registry
......
# Copyright (c) OpenMMLab. All rights reserved.
+from .dgcnn_head import DGCNNHead
from .paconv_head import PAConvHead
from .pointnet2_head import PointNet2Head
-__all__ = ['PointNet2Head', 'PAConvHead']
+__all__ = ['PointNet2Head', 'DGCNNHead', 'PAConvHead']
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta, abstractmethod
from mmcv.cnn import normal_init
from mmcv.runner import BaseModule, auto_fp16, force_fp32
from torch import nn as nn
......@@ -13,17 +14,18 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
Args:
channels (int): Channels after modules, before conv_seg.
num_classes (int): Number of classes.
-        dropout_ratio (float): Ratio of dropout layer. Default: 0.5.
-        conv_cfg (dict|None): Config of conv layers.
+        dropout_ratio (float, optional): Ratio of dropout layer. Default: 0.5.
+        conv_cfg (dict, optional): Config of conv layers.
            Default: dict(type='Conv1d').
-        norm_cfg (dict|None): Config of norm layers.
+        norm_cfg (dict, optional): Config of norm layers.
            Default: dict(type='BN1d').
-        act_cfg (dict): Config of activation layers.
+        act_cfg (dict, optional): Config of activation layers.
            Default: dict(type='ReLU').
-        loss_decode (dict): Config of decode loss.
+        loss_decode (dict, optional): Config of decode loss.
            Default: dict(type='CrossEntropyLoss').
-        ignore_index (int | None): The label index to be ignored. When using
-            masked BCE loss, ignore_index should be set to None. Default: 255.
+        ignore_index (int, optional): The label index to be ignored.
+            When using masked BCE loss, ignore_index should be set to None.
+            Default: 255.
"""
def __init__(self,
......@@ -110,9 +112,9 @@ class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
"""Compute semantic segmentation loss.
Args:
-            seg_logit (torch.Tensor): Predicted per-point segmentation logits \
+            seg_logit (torch.Tensor): Predicted per-point segmentation logits
                of shape [B, num_classes, N].
-            seg_label (torch.Tensor): Ground-truth segmentation label of \
+            seg_label (torch.Tensor): Ground-truth segmentation label of
shape [B, N].
"""
loss = dict()
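The `ignore_index` documented above is what keeps unlabeled points out of this loss. A minimal sketch of the masking behaviour in plain PyTorch (not the registered `loss_decode` module; shapes follow the docstring):

```python
import torch
import torch.nn.functional as F

# seg_logit: [B, num_classes, N], seg_label: [B, N], as documented above.
B, num_classes, N = 2, 4, 6
seg_logit = torch.randn(B, num_classes, N)
seg_label = torch.randint(0, num_classes, (B, N))
seg_label[0, :2] = 255  # unlabeled points carry the ignore_index

# Points labeled 255 contribute neither to the loss nor to its gradient.
loss = F.cross_entropy(seg_logit, seg_label, ignore_index=255)
```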
......
# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.cnn.bricks import ConvModule
from mmdet3d.ops import DGCNNFPModule
from mmdet.models import HEADS
from .decode_head import Base3DDecodeHead
@HEADS.register_module()
class DGCNNHead(Base3DDecodeHead):
r"""DGCNN decoder head.
Decoder head used in `DGCNN <https://arxiv.org/abs/1801.07829>`_.
Refer to the
`reimplementation code <https://github.com/AnTao97/dgcnn.pytorch>`_.
Args:
fp_channels (tuple[int], optional): Tuple of mlp channels in feature
propagation (FP) modules. Defaults to (1216, 512).
"""
def __init__(self, fp_channels=(1216, 512), **kwargs):
super(DGCNNHead, self).__init__(**kwargs)
self.FP_module = DGCNNFPModule(
mlp_channels=fp_channels, act_cfg=self.act_cfg)
# https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L40
self.pre_seg_conv = ConvModule(
fp_channels[-1],
self.channels,
kernel_size=1,
bias=False,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg)
def _extract_input(self, feat_dict):
"""Extract inputs from features dictionary.
Args:
feat_dict (dict): Feature dict from backbone.
Returns:
torch.Tensor: points for decoder.
"""
fa_points = feat_dict['fa_points']
return fa_points
def forward(self, feat_dict):
"""Forward pass.
Args:
feat_dict (dict): Feature dict from backbone.
Returns:
torch.Tensor: Segmentation map of shape [B, num_classes, N].
"""
fa_points = self._extract_input(feat_dict)
fp_points = self.FP_module(fa_points)
fp_points = fp_points.transpose(1, 2).contiguous()
output = self.pre_seg_conv(fp_points)
output = self.cls_seg(output)
return output
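For orientation, a rough usage sketch of the new head. The `channels`/`num_classes` values are illustrative, and the `fa_points` shape is inferred from `forward` above rather than documented, so treat it as an assumption:

```python
import torch
from mmdet3d.models.decode_heads import DGCNNHead

head = DGCNNHead(
    fp_channels=(1216, 512),  # mlp channels of the FP module, as above
    channels=256,             # Base3DDecodeHead: channels before conv_seg
    num_classes=13)           # e.g. S3DIS has 13 classes

# fa_points comes from the DGCNN backbone's feature aggregation layer.
feat_dict = dict(fa_points=torch.rand(2, 4096, 1216))  # [B, N, C], assumed
seg_logits = head(feat_dict)  # [B, num_classes, N]
```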
......@@ -14,7 +14,7 @@ class PAConvHead(PointNet2Head):
Args:
fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules.
-        fp_norm_cfg (dict|None): Config of norm layers used in FP modules.
+        fp_norm_cfg (dict): Config of norm layers used in FP modules.
Default: dict(type='BN2d').
"""
......
......@@ -16,7 +16,7 @@ class PointNet2Head(Base3DDecodeHead):
Args:
fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules.
-        fp_norm_cfg (dict|None): Config of norm layers used in FP modules.
+        fp_norm_cfg (dict): Config of norm layers used in FP modules.
Default: dict(type='BN2d').
"""
......
......@@ -7,8 +7,12 @@ from .centerpoint_head import CenterHead
from .fcos_mono3d_head import FCOSMono3DHead
from .free_anchor3d_head import FreeAnchor3DHead
from .groupfree3d_head import GroupFree3DHead
+from .monoflex_head import MonoFlexHead
from .parta2_rpn_head import PartA2RPNHead
+from .pgd_head import PGDHead
+from .point_rpn_head import PointRPNHead
from .shape_aware_head import ShapeAwareHead
+from .smoke_mono3d_head import SMOKEMono3DHead
from .ssd_3d_head import SSD3DHead
from .vote_head import VoteHead
......@@ -16,5 +20,6 @@ __all__ = [
'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead',
'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead',
-    'GroupFree3DHead'
+    'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead',
+    'MonoFlexHead'
]
......@@ -51,15 +51,15 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
type='Anchor3DRangeGenerator',
range=[0, -39.68, -1.78, 69.12, 39.68, -1.78],
strides=[2],
-                     sizes=[[1.6, 3.9, 1.56]],
+                     sizes=[[3.9, 1.6, 1.56]],
rotations=[0, 1.57],
custom_values=[],
reshape_out=False),
assigner_per_size=False,
assign_per_class=False,
diff_rad_by_sin=True,
-                 dir_offset=0,
-                 dir_limit_offset=1,
+                 dir_offset=-np.pi / 2,
+                 dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='CrossEntropyLoss',
......@@ -81,6 +81,10 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
self.assign_per_class = assign_per_class
self.dir_offset = dir_offset
self.dir_limit_offset = dir_limit_offset
+        import warnings
+        warnings.warn(
+            'dir_offset and dir_limit_offset will be deprecated and '
+            'incorporated into the box coder in the future')
self.fp16_enabled = False
# build anchor generator
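The new defaults follow the v1.0 coordinate refactor: anchor `sizes` are now ordered as (dx, dy, dz), so 3.9 is the car length along x, and direction targets use `dir_offset=-np.pi/2` with `dir_limit_offset=0`. A self-contained sketch of how those two values bin a yaw angle, mirroring `mmdet3d.core.limit_period`:

```python
import numpy as np

def limit_period(val, offset=0.5, period=np.pi):
    # Same formula as mmdet3d.core.limit_period: maps val into
    # [-offset * period, (1 - offset) * period).
    return val - np.floor(val / period + offset) * period

dir_offset, dir_limit_offset, num_bins = -np.pi / 2, 0, 2
yaw = np.array([0.3, 2.0, -1.2, 3.0])
offset_rot = limit_period(yaw - dir_offset, dir_limit_offset, 2 * np.pi)
dir_cls = np.floor(offset_rot / (2 * np.pi / num_bins)).astype(np.int64)
dir_cls = np.clip(dir_cls, 0, num_bins - 1)  # [0 1 0 1]
```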
......@@ -145,7 +149,7 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
x (torch.Tensor): Input features.
Returns:
-            tuple[torch.Tensor]: Contain score of each class, bbox \
+            tuple[torch.Tensor]: Contain score of each class, bbox
regression and direction classification predictions.
"""
cls_score = self.conv_cls(x)
......@@ -163,7 +167,7 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
features produced by FPN.
Returns:
-            tuple[list[torch.Tensor]]: Multi-level class score, bbox \
+            tuple[list[torch.Tensor]]: Multi-level class score, bbox
and direction predictions.
"""
return multi_apply(self.forward_single, feats)
......@@ -177,7 +181,7 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
device (str): device of current module.
Returns:
-            list[list[torch.Tensor]]: Anchors of each image, valid flags \
+            list[list[torch.Tensor]]: Anchors of each image, valid flags
of each image.
"""
num_imgs = len(input_metas)
......@@ -207,7 +211,7 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
num_total_samples (int): The number of valid samples.
Returns:
-            tuple[torch.Tensor]: Losses of class, bbox \
+            tuple[torch.Tensor]: Losses of class, bbox
and direction, respectively.
"""
# classification loss
......@@ -285,7 +289,7 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
the 7th dimension is rotation dimension.
Returns:
-            tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th \
+            tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th
dimensions are changed.
"""
rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos(
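`add_sin_difference` encodes the yaw pair so the regression loss compares sin(a - b) = sin(a)cos(b) - cos(a)sin(b) rather than raw angles; a quick check of the identity:

```python
import torch

a = torch.tensor([0.2, 3.0, -1.4])    # yaw of boxes1[..., 6]
b = torch.tensor([0.25, -3.0, -1.3])  # yaw of boxes2[..., 6]

enc_a = torch.sin(a) * torch.cos(b)  # replaces boxes1's rotation entry
enc_b = torch.cos(a) * torch.sin(b)  # replaces boxes2's rotation entry

# L1/SmoothL1 on (enc_a - enc_b) is a loss on sin(a - b), which is smooth
# and periodic, unlike a loss on the raw angle difference.
assert torch.allclose(enc_a - enc_b, torch.sin(a - b), atol=1e-6)
```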
......@@ -318,16 +322,16 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
of each sample.
gt_labels (list[torch.Tensor]): Gt labels of each sample.
input_metas (list[dict]): Contain pcd and img's meta info.
-            gt_bboxes_ignore (None | list[torch.Tensor]): Specify
-                which bounding.
+            gt_bboxes_ignore (list[torch.Tensor]): Specify
+                which bounding boxes to ignore.
Returns:
-            dict[str, list[torch.Tensor]]: Classification, bbox, and \
+            dict[str, list[torch.Tensor]]: Classification, bbox, and
direction losses of each level.
- loss_cls (list[torch.Tensor]): Classification losses.
- loss_bbox (list[torch.Tensor]): Box regression losses.
-                - loss_dir (list[torch.Tensor]): Direction classification \
+                - loss_dir (list[torch.Tensor]): Direction classification
losses.
"""
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
......@@ -385,7 +389,7 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
dir_cls_preds (list[torch.Tensor]): Multi-level direction
class predictions.
input_metas (list[dict]): Contain pcd and img's meta info.
-            cfg (None | :obj:`ConfigDict`): Training or testing config.
+            cfg (:obj:`ConfigDict`): Training or testing config.
            rescale (bool): Whether to rescale bboxes.
Returns:
......@@ -439,7 +443,7 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
mlvl_anchors (List[torch.Tensor]): Multi-level anchors
in single batch.
input_meta (list[dict]): Contain pcd and img's meta info.
-            cfg (None | :obj:`ConfigDict`): Training or testing config.
+            cfg (:obj:`ConfigDict`): Training or testing config.
            rescale (bool): Whether to rescale bboxes.
Returns:
......
# Copyright (c) OpenMMLab. All rights reserved.
-import torch
from abc import abstractmethod
+import torch
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from mmcv.runner import force_fp32
from torch import nn as nn
......@@ -18,35 +19,45 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
-        feat_channels (int): Number of hidden channels. Used in child classes.
-        stacked_convs (int): Number of stacking convs of the head.
-        strides (tuple): Downsample factor of each feature map.
-        dcn_on_last_conv (bool): If true, use dcn in the last layer of
-            towers. Default: False.
-        conv_bias (bool | str): If specified as `auto`, it will be decided by
-            the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
-            None, otherwise False. Default: "auto".
-        background_label (int | None): Label ID of background, set as 0 for
-            RPN and num_classes for other heads. It will automatically set as
-            num_classes if None is given.
-        use_direction_classifier (bool): Whether to add a direction classifier.
-        diff_rad_by_sin (bool): Whether to change the difference into sin
-            difference for box regression loss.
-        loss_cls (dict): Config of classification loss.
-        loss_bbox (dict): Config of localization loss.
-        loss_dir (dict): Config of direction classifier loss.
-        loss_attr (dict): Config of attribute classifier loss, which is only
-            active when pred_attrs=True.
-        bbox_code_size (int): Dimensions of predicted bounding boxes.
-        pred_attrs (bool): Whether to predict attributes. Default to False.
-        num_attrs (int): The number of attributes to be predicted. Default: 9.
-        pred_velo (bool): Whether to predict velocity. Default to False.
-        pred_bbox2d (bool): Whether to predict 2D boxes. Default to False.
-        group_reg_dims (tuple[int]): The dimension of each regression target
-            group. Default: (2, 1, 3, 1, 2).
-        cls_branch (tuple[int]): Channels for classification branch.
+        feat_channels (int, optional): Number of hidden channels.
+            Used in child classes. Defaults to 256.
+        stacked_convs (int, optional): Number of stacking convs of the head.
+        strides (tuple, optional): Downsample factor of each feature map.
+        dcn_on_last_conv (bool, optional): If true, use dcn in the last
+            layer of towers. Default: False.
+        conv_bias (bool | str, optional): If specified as `auto`, it will be
+            decided by the norm_cfg. Bias of conv will be set as True
+            if `norm_cfg` is None, otherwise False. Default: 'auto'.
+        background_label (int, optional): Label ID of background,
+            set as 0 for RPN and num_classes for other heads.
+            It will automatically set as `num_classes` if None is given.
+        use_direction_classifier (bool, optional):
+            Whether to add a direction classifier.
+        diff_rad_by_sin (bool, optional): Whether to change the difference
+            into sin difference for box regression loss. Defaults to True.
+        dir_offset (float, optional): Parameter used in direction
+            classification. Defaults to 0.
+        dir_limit_offset (float, optional): Parameter used in direction
+            classification. Defaults to 0.
+        loss_cls (dict, optional): Config of classification loss.
+        loss_bbox (dict, optional): Config of localization loss.
+        loss_dir (dict, optional): Config of direction classifier loss.
+        loss_attr (dict, optional): Config of attribute classifier loss,
+            which is only active when `pred_attrs=True`.
+        bbox_code_size (int, optional): Dimensions of predicted bounding boxes.
+        pred_attrs (bool, optional): Whether to predict attributes.
+            Defaults to False.
+        num_attrs (int, optional): The number of attributes to be predicted.
+            Default: 9.
+        pred_velo (bool, optional): Whether to predict velocity.
+            Defaults to False.
+        pred_bbox2d (bool, optional): Whether to predict 2D boxes.
+            Defaults to False.
+        group_reg_dims (tuple[int], optional): The dimension of each regression
+            target group. Default: (2, 1, 3, 1, 2).
+        cls_branch (tuple[int], optional): Channels for classification branch.
Default: (128, 64).
-        reg_branch (tuple[tuple]): Channels for regression branch.
+        reg_branch (tuple[tuple], optional): Channels for regression branch.
Default: (
(128, 64), # offset
(128, 64), # depth
......@@ -54,14 +65,16 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
(64, ), # rot
() # velo
),
-        dir_branch (tuple[int]): Channels for direction classification branch.
-            Default: (64, ).
-        attr_branch (tuple[int]): Channels for classification branch.
+        dir_branch (tuple[int], optional): Channels for direction
+            classification branch. Default: (64, ).
+        attr_branch (tuple[int], optional): Channels for classification branch.
Default: (64, ).
-        conv_cfg (dict): Config dict for convolution layer. Default: None.
-        norm_cfg (dict): Config dict for normalization layer. Default: None.
-        train_cfg (dict): Training config of anchor head.
-        test_cfg (dict): Testing config of anchor head.
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Default: None.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: None.
+        train_cfg (dict, optional): Training config of anchor head.
+        test_cfg (dict, optional): Testing config of anchor head.
""" # noqa: W605
_version = 1
......@@ -79,6 +92,7 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
use_direction_classifier=True,
diff_rad_by_sin=True,
dir_offset=0,
+                 dir_limit_offset=0,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
......@@ -125,6 +139,7 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
self.use_direction_classifier = use_direction_classifier
self.diff_rad_by_sin = diff_rad_by_sin
self.dir_offset = dir_offset
+        self.dir_limit_offset = dir_limit_offset
self.loss_cls = build_loss(loss_cls)
self.loss_bbox = build_loss(loss_bbox)
self.loss_dir = build_loss(loss_dir)
......@@ -162,13 +177,6 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
self.attr_branch = attr_branch
self._init_layers()
-        if init_cfg is None:
-            self.init_cfg = dict(
-                type='Normal',
-                layer='Conv2d',
-                std=0.01,
-                override=dict(
-                    type='Normal', name='conv_cls', std=0.01, bias_prob=0.01))
def _init_layers(self):
"""Initialize layers of the head."""
......@@ -274,8 +282,34 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
self.conv_attr = nn.Conv2d(self.attr_branch[-1], self.num_attrs, 1)
def init_weights(self):
-        super().init_weights()
+        """Initialize weights of the head.
+        We currently still use the customized defined init_weights because the
+        default init of DCN triggered by the init_cfg will init
+        conv_offset.weight, which mistakenly affects the training stability.
+        """
+        for modules in [self.cls_convs, self.reg_convs, self.conv_cls_prev]:
+            for m in modules:
+                if isinstance(m.conv, nn.Conv2d):
+                    normal_init(m.conv, std=0.01)
+        for conv_reg_prev in self.conv_reg_prevs:
+            if conv_reg_prev is None:
+                continue
+            for m in conv_reg_prev:
+                if isinstance(m.conv, nn.Conv2d):
+                    normal_init(m.conv, std=0.01)
+        if self.use_direction_classifier:
+            for m in self.conv_dir_cls_prev:
+                if isinstance(m.conv, nn.Conv2d):
+                    normal_init(m.conv, std=0.01)
+        if self.pred_attrs:
+            for m in self.conv_attr_prev:
+                if isinstance(m.conv, nn.Conv2d):
+                    normal_init(m.conv, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.conv_cls, std=0.01, bias=bias_cls)
for conv_reg in self.conv_regs:
normal_init(conv_reg, std=0.01)
if self.use_direction_classifier:
normal_init(self.conv_dir_cls, std=0.01, bias=bias_cls)
if self.pred_attrs:
......@@ -289,7 +323,7 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
a 4D-tensor.
Returns:
-            tuple: Usually contain classification scores, bbox predictions, \
+            tuple: Usually contain classification scores, bbox predictions,
and direction class predictions.
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
......@@ -307,7 +341,7 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
return multi_apply(self.forward_single, feats)[:5]
def forward_single(self, x):
"""Forward features of a single scale levle.
"""Forward features of a single scale level.
Args:
x (Tensor): FPN feature maps of the specified stride.
......@@ -401,7 +435,7 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
-            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+            gt_bboxes_ignore (list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
"""
......
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta, abstractmethod
from mmcv.runner import BaseModule
......
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import torch
from mmcv.cnn import ConvModule, build_conv_layer
from mmcv.runner import BaseModule, force_fp32
......@@ -21,16 +22,16 @@ class SeparateHead(BaseModule):
Args:
in_channels (int): Input channels for conv_layer.
heads (dict): Conv information.
-        head_conv (int): Output channels.
+        head_conv (int, optional): Output channels.
            Default: 64.
-        final_kernal (int): Kernal size for the last conv layer.
-            Deafult: 1.
-        init_bias (float): Initial bias. Default: -2.19.
-        conv_cfg (dict): Config of conv layer.
+        final_kernel (int, optional): Kernel size for the last conv layer.
+            Default: 1.
+        init_bias (float, optional): Initial bias. Default: -2.19.
+        conv_cfg (dict, optional): Config of conv layer.
            Default: dict(type='Conv2d')
-        norm_cfg (dict): Config of norm layer.
+        norm_cfg (dict, optional): Config of norm layer.
            Default: dict(type='BN2d').
-        bias (str): Type of bias. Default: 'auto'.
+        bias (str, optional): Type of bias. Default: 'auto'.
"""
def __init__(self,
......@@ -100,17 +101,17 @@ class SeparateHead(BaseModule):
Returns:
dict[str: torch.Tensor]: contains the following keys:
-                -reg (torch.Tensor): 2D regression value with the \
+                -reg (torch.Tensor): 2D regression value with the
                    shape of [B, 2, H, W].
-                -height (torch.Tensor): Height value with the \
+                -height (torch.Tensor): Height value with the
                    shape of [B, 1, H, W].
-                -dim (torch.Tensor): Size value with the shape \
+                -dim (torch.Tensor): Size value with the shape
                    of [B, 3, H, W].
-                -rot (torch.Tensor): Rotation value with the \
+                -rot (torch.Tensor): Rotation value with the
                    shape of [B, 2, H, W].
-                -vel (torch.Tensor): Velocity value with the \
+                -vel (torch.Tensor): Velocity value with the
                    shape of [B, 2, H, W].
-                -heatmap (torch.Tensor): Heatmap with the shape of \
+                -heatmap (torch.Tensor): Heatmap with the shape of
[B, N, H, W].
"""
ret_dict = dict()
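A rough usage sketch of `SeparateHead`: in the CenterPoint convention each entry of `heads` is an `(output_channels, num_convs)` pair for one branch, and `forward` returns a dict with exactly those keys (values here are illustrative):

```python
import torch
from mmdet3d.models.dense_heads.centerpoint_head import SeparateHead

head = SeparateHead(
    in_channels=64,
    heads=dict(reg=(2, 2), height=(1, 2), dim=(3, 2),
               rot=(2, 2), vel=(2, 2), heatmap=(2, 2)),
    head_conv=64,
    final_kernel=3,
    init_bias=-2.19)  # biases the heatmap toward "no object" at init

x = torch.rand(4, 64, 128, 128)  # [B, C, H, W] shared features
ret = head(x)                    # dict: reg -> [4, 2, 128, 128], etc.
```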
......@@ -131,18 +132,19 @@ class DCNSeparateHead(BaseModule):
Args:
in_channels (int): Input channels for conv_layer.
num_cls (int): Number of classes.
heads (dict): Conv information.
dcn_config (dict): Config of dcn layer.
-        num_cls (int): Output channels.
+        head_conv (int, optional): Output channels.
+            Default: 64.
-        final_kernal (int): Kernal size for the last conv layer.
-            Deafult: 1.
-        init_bias (float): Initial bias. Default: -2.19.
-        conv_cfg (dict): Config of conv layer.
+        final_kernel (int, optional): Kernel size for the last conv
+            layer. Default: 1.
+        init_bias (float, optional): Initial bias. Default: -2.19.
+        conv_cfg (dict, optional): Config of conv layer.
            Default: dict(type='Conv2d')
-        norm_cfg (dict): Config of norm layer.
+        norm_cfg (dict, optional): Config of norm layer.
            Default: dict(type='BN2d').
-        bias (str): Type of bias. Default: 'auto'.
+        bias (str, optional): Type of bias. Default: 'auto'.
""" # noqa: W605
def __init__(self,
......@@ -215,17 +217,17 @@ class DCNSeparateHead(BaseModule):
Returns:
dict[str: torch.Tensor]: contains the following keys:
-                -reg (torch.Tensor): 2D regression value with the \
+                -reg (torch.Tensor): 2D regression value with the
                    shape of [B, 2, H, W].
-                -height (torch.Tensor): Height value with the \
+                -height (torch.Tensor): Height value with the
                    shape of [B, 1, H, W].
-                -dim (torch.Tensor): Size value with the shape \
+                -dim (torch.Tensor): Size value with the shape
                    of [B, 3, H, W].
-                -rot (torch.Tensor): Rotation value with the \
+                -rot (torch.Tensor): Rotation value with the
                    shape of [B, 2, H, W].
-                -vel (torch.Tensor): Velocity value with the \
+                -vel (torch.Tensor): Velocity value with the
                    shape of [B, 2, H, W].
-                -heatmap (torch.Tensor): Heatmap with the shape of \
+                -heatmap (torch.Tensor): Heatmap with the shape of
[B, N, H, W].
"""
center_feat = self.feature_adapt_cls(x)
......@@ -243,31 +245,30 @@ class CenterHead(BaseModule):
"""CenterHead for CenterPoint.
Args:
-        mode (str): Mode of the head. Default: '3d'.
-        in_channels (list[int] | int): Channels of the input feature map.
-            Default: [128].
-        tasks (list[dict]): Task information including class number
+        in_channels (list[int] | int, optional): Channels of the input
+            feature map. Default: [128].
+        tasks (list[dict], optional): Task information including class number
            and class names. Default: None.
-        dataset (str): Name of the dataset. Default: 'nuscenes'.
-        weight (float): Weight for location loss. Default: 0.25.
-        code_weights (list[int]): Code weights for location loss. Default: [].
-        common_heads (dict): Conv information for common heads.
+        train_cfg (dict, optional): Train-time configs. Default: None.
+        test_cfg (dict, optional): Test-time configs. Default: None.
+        bbox_coder (dict, optional): Bbox coder configs. Default: None.
+        common_heads (dict, optional): Conv information for common heads.
            Default: dict().
-        loss_cls (dict): Config of classification loss function.
+        loss_cls (dict, optional): Config of classification loss function.
            Default: dict(type='GaussianFocalLoss', reduction='mean').
-        loss_bbox (dict): Config of regression loss function.
+        loss_bbox (dict, optional): Config of regression loss function.
            Default: dict(type='L1Loss', reduction='none').
-        separate_head (dict): Config of separate head. Default: dict(
+        separate_head (dict, optional): Config of separate head. Default: dict(
            type='SeparateHead', init_bias=-2.19, final_kernel=3)
-        share_conv_channel (int): Output channels for share_conv_layer.
-            Default: 64.
-        num_heatmap_convs (int): Number of conv layers for heatmap conv layer.
-            Default: 2.
-        conv_cfg (dict): Config of conv layer.
+        share_conv_channel (int, optional): Output channels for share_conv
+            layer. Default: 64.
+        num_heatmap_convs (int, optional): Number of conv layers for heatmap
+            conv layer. Default: 2.
+        conv_cfg (dict, optional): Config of conv layer.
            Default: dict(type='Conv2d')
-        norm_cfg (dict): Config of norm layer.
+        norm_cfg (dict, optional): Config of norm layer.
            Default: dict(type='BN2d').
-        bias (str): Type of bias. Default: 'auto'.
+        bias (str, optional): Type of bias. Default: 'auto'.
"""
def __init__(self,
......@@ -366,8 +367,8 @@ class CenterHead(BaseModule):
feat (torch.tensor): Feature map with the shape of [B, H*W, 10].
ind (torch.Tensor): Index of the ground truth boxes with the
shape of [B, max_obj].
-            mask (torch.Tensor): Mask of the feature map with the shape
-                of [B, max_obj]. Default: None.
+            mask (torch.Tensor, optional): Mask of the feature map with the
+                shape of [B, max_obj]. Default: None.
Returns:
torch.Tensor: Feature map after gathering with the shape
......@@ -403,14 +404,14 @@ class CenterHead(BaseModule):
Returns:
Returns:
-                tuple[list[torch.Tensor]]: Tuple of target including \
+                tuple[list[torch.Tensor]]: Tuple of target including
                    the following results in order.
                    - list[torch.Tensor]: Heatmap scores.
                    - list[torch.Tensor]: Ground truth boxes.
-                    - list[torch.Tensor]: Indexes indicating the \
+                    - list[torch.Tensor]: Indexes indicating the
                        position of the valid boxes.
-                    - list[torch.Tensor]: Masks indicating which \
+                    - list[torch.Tensor]: Masks indicating which
boxes are valid.
"""
heatmaps, anno_boxes, inds, masks = multi_apply(
......@@ -437,14 +438,14 @@ class CenterHead(BaseModule):
gt_labels_3d (torch.Tensor): Labels of boxes.
Returns:
-            tuple[list[torch.Tensor]]: Tuple of target including \
+            tuple[list[torch.Tensor]]: Tuple of target including
                the following results in order.
                - list[torch.Tensor]: Heatmap scores.
                - list[torch.Tensor]: Ground truth boxes.
-                - list[torch.Tensor]: Indexes indicating the position \
+                - list[torch.Tensor]: Indexes indicating the position
                    of the valid boxes.
-                - list[torch.Tensor]: Masks indicating which boxes \
+                - list[torch.Tensor]: Masks indicating which boxes
are valid.
"""
device = gt_labels_3d.device
......@@ -728,11 +729,11 @@ class CenterHead(BaseModule):
Returns:
list[dict[str: torch.Tensor]]: contains the following keys:
-                -bboxes (torch.Tensor): Prediction bboxes after nms with the \
+                -bboxes (torch.Tensor): Prediction bboxes after nms with the
                    shape of [N, 9].
-                -scores (torch.Tensor): Prediction scores after nms with the \
+                -scores (torch.Tensor): Prediction scores after nms with the
                    shape of [N].
-                -labels (torch.Tensor): Prediction labels after nms with the \
+                -labels (torch.Tensor): Prediction labels after nms with the
shape of [N].
"""
predictions_dicts = []
......@@ -781,7 +782,7 @@ class CenterHead(BaseModule):
boxes_for_nms,
top_scores,
thresh=self.test_cfg['nms_thr'],
-                    pre_maxsize=self.test_cfg['pre_max_size'],
+                    pre_max_size=self.test_cfg['pre_max_size'],
post_max_size=self.test_cfg['post_max_size'])
else:
selected = []
......
# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
import numpy as np
import torch
-from mmcv.cnn import Scale
+from mmcv.cnn import Scale, normal_init
from mmcv.runner import force_fp32
from torch import nn as nn
-from mmdet3d.core import box3d_multiclass_nms, limit_period, xywhr2xyxyr
+from mmdet3d.core import (box3d_multiclass_nms, limit_period, points_img2cam,
+                          xywhr2xyxyr)
from mmdet.core import multi_apply
from mmdet.core.bbox.builder import build_bbox_coder
from mmdet.models.builder import HEADS, build_loss
from .anchor_free_mono3d_head import AnchorFreeMono3DHead
......@@ -21,31 +25,29 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
-        regress_ranges (tuple[tuple[int, int]]): Regress range of multiple
+        regress_ranges (tuple[tuple[int, int]], optional): Regress range of multiple
            level points.
-        center_sampling (bool): If true, use center sampling. Default: True.
-        center_sample_radius (float): Radius of center sampling. Default: 1.5.
-        norm_on_bbox (bool): If true, normalize the regression targets
+        center_sampling (bool, optional): If true, use center sampling. Default: True.
+        center_sample_radius (float, optional): Radius of center sampling. Default: 1.5.
+        norm_on_bbox (bool, optional): If true, normalize the regression targets
            with FPN strides. Default: True.
-        centerness_on_reg (bool): If true, position centerness on the
+        centerness_on_reg (bool, optional): If true, position centerness on the
            regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042.
            Default: True.
-        centerness_alpha: Parameter used to adjust the intensity attenuation
-            from the center to the periphery. Default: 2.5.
-        loss_cls (dict): Config of classification loss.
-        loss_bbox (dict): Config of localization loss.
-        loss_dir (dict): Config of direction classification loss.
-        loss_attr (dict): Config of attribute classification loss.
-        loss_centerness (dict): Config of centerness loss.
-        norm_cfg (dict): dictionary to construct and config norm layer.
+        centerness_alpha (int, optional): Parameter used to adjust the intensity
+            attenuation from the center to the periphery. Default: 2.5.
+        loss_cls (dict, optional): Config of classification loss.
+        loss_bbox (dict, optional): Config of localization loss.
+        loss_dir (dict, optional): Config of direction classification loss.
+        loss_attr (dict, optional): Config of attribute classification loss.
+        loss_centerness (dict, optional): Config of centerness loss.
+        norm_cfg (dict, optional): dictionary to construct and config norm layer.
            Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
-        centerness_branch (tuple[int]): Channels for centerness branch.
+        centerness_branch (tuple[int], optional): Channels for centerness branch.
Default: (64, ).
""" # noqa: E501
def __init__(self,
num_classes,
in_channels,
regress_ranges=((-1, 48), (48, 96), (96, 192), (192, 384),
(384, INF)),
center_sampling=True,
......@@ -73,6 +75,7 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0),
+                 bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9),
norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
centerness_branch=(64, ),
init_cfg=None,
......@@ -85,8 +88,6 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
self.centerness_alpha = centerness_alpha
self.centerness_branch = centerness_branch
super().__init__(
-            num_classes,
-            in_channels,
loss_cls=loss_cls,
loss_bbox=loss_bbox,
loss_dir=loss_dir,
......@@ -95,13 +96,8 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
init_cfg=init_cfg,
**kwargs)
self.loss_centerness = build_loss(loss_centerness)
-        if init_cfg is None:
-            self.init_cfg = dict(
-                type='Normal',
-                layer='Conv2d',
-                std=0.01,
-                override=dict(
-                    type='Normal', name='conv_cls', std=0.01, bias_prob=0.01))
+        bbox_coder['code_size'] = self.bbox_code_size
+        self.bbox_coder = build_bbox_coder(bbox_coder)
def _init_layers(self):
"""Initialize layers of the head."""
......@@ -110,9 +106,24 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
conv_channels=self.centerness_branch,
conv_strides=(1, ) * len(self.centerness_branch))
self.conv_centerness = nn.Conv2d(self.centerness_branch[-1], 1, 1)
+        self.scale_dim = 3  # only for offset, depth and size regression
self.scales = nn.ModuleList([
-            nn.ModuleList([Scale(1.0) for _ in range(3)]) for _ in self.strides
-        ])  # only for offset, depth and size regression
+            nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)])
+            for _ in self.strides
+        ])
+    def init_weights(self):
+        """Initialize weights of the head.
+        We currently still use the customized init_weights because the default
+        init of DCN triggered by the init_cfg will init conv_offset.weight,
+        which mistakenly affects the training stability.
+        """
+        super().init_weights()
+        for m in self.conv_centerness_prev:
+            if isinstance(m.conv, nn.Conv2d):
+                normal_init(m.conv, std=0.01)
+        normal_init(self.conv_centerness, std=0.01)
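`self.scales` keeps `scale_dim = 3` learnable factors per FPN level, one each for the offset, depth and size groups. `mmcv.cnn.Scale` is just a learnable scalar multiplier; a minimal re-statement:

```python
import torch
from torch import nn

class Scale(nn.Module):
    """Minimal stand-in for mmcv.cnn.Scale: multiply the input by a
    learnable scalar initialized to `scale`."""

    def __init__(self, scale=1.0):
        super().__init__()
        self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))

    def forward(self, x):
        return x * self.scale

# One (offset, depth, size) triple per FPN stride, as in _init_layers above.
strides = (8, 16, 32, 64, 128)  # illustrative FCOS3D strides
scales = nn.ModuleList(
    nn.ModuleList(Scale(1.0) for _ in range(3)) for _ in strides)
```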
def forward(self, feats):
"""Forward features from the upstream network.
......@@ -138,11 +149,12 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
centernesses (list[Tensor]): Centerness for each scale level,
each is a 4D-tensor, the channel number is num_points * 1.
"""
+        # Note: we use [:5] to filter feats and only return predictions
return multi_apply(self.forward_single, feats, self.scales,
-                           self.strides)
+                           self.strides)[:5]
def forward_single(self, x, scale, stride):
"""Forward features of a single scale levle.
"""Forward features of a single scale level.
Args:
x (Tensor): FPN feature maps of the specified stride.
......@@ -153,7 +165,7 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
is True.
Returns:
-            tuple: scores for each class, bbox and direction class \
+            tuple: scores for each class, bbox and direction class
predictions, centerness predictions of input feature maps.
"""
cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \
......@@ -169,26 +181,12 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
for conv_centerness_prev_layer in self.conv_centerness_prev:
clone_cls_feat = conv_centerness_prev_layer(clone_cls_feat)
centerness = self.conv_centerness(clone_cls_feat)
-        # scale the bbox_pred of different level
-        # only apply to offset, depth and size prediction
-        scale_offset, scale_depth, scale_size = scale[0:3]
-        clone_bbox_pred = bbox_pred.clone()
-        bbox_pred[:, :2] = scale_offset(clone_bbox_pred[:, :2]).float()
-        bbox_pred[:, 2] = scale_depth(clone_bbox_pred[:, 2]).float()
-        bbox_pred[:, 3:6] = scale_size(clone_bbox_pred[:, 3:6]).float()
+        bbox_pred = self.bbox_coder.decode(bbox_pred, scale, stride,
+                                           self.training, cls_score)
-        bbox_pred[:, 2] = bbox_pred[:, 2].exp()
-        bbox_pred[:, 3:6] = bbox_pred[:, 3:6].exp() + 1e-6  # avoid size=0
-        assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\
-            'has not been thoroughly tested for FCOS3D.'
-        if self.norm_on_bbox:
-            if not self.training:
-                # Note that this line is conducted only when testing
-                bbox_pred[:, :2] *= stride
-        return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness
+        return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \
+            cls_feat, reg_feat
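The deleted block above is essentially what moved into `FCOS3DBBoxCoder.decode`. A hedged re-statement of that behaviour, reconstructed from the removed lines rather than from the coder's actual source:

```python
def decode_sketch(bbox_pred, scale, stride, training):
    # Scale the offset / depth / size groups with the per-level factors.
    scale_offset, scale_depth, scale_size = scale[0:3]
    clone = bbox_pred.clone()
    bbox_pred[:, :2] = scale_offset(clone[:, :2]).float()
    bbox_pred[:, 2] = scale_depth(clone[:, 2]).float()
    bbox_pred[:, 3:6] = scale_size(clone[:, 3:6]).float()

    # exp-transform depth and size so they stay positive.
    bbox_pred[:, 2] = bbox_pred[:, 2].exp()
    bbox_pred[:, 3:6] = bbox_pred[:, 3:6].exp() + 1e-6  # avoid size=0

    # norm_on_bbox: offsets are stride-normalized during training and
    # rescaled to input-image units only at test time.
    if not training:
        bbox_pred[:, :2] *= stride
    return bbox_pred
```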
@staticmethod
def add_sin_difference(boxes1, boxes2):
......@@ -201,7 +199,7 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
the 7th dimension is rotation dimension.
Returns:
-            tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th \
+            tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th
dimensions are changed.
"""
rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos(
......@@ -217,21 +215,27 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
@staticmethod
def get_direction_target(reg_targets,
dir_offset=0,
+                             dir_limit_offset=0.0,
num_bins=2,
one_hot=True):
"""Encode direction to 0 ~ num_bins-1.
Args:
reg_targets (torch.Tensor): Bbox regression targets.
-            dir_offset (int): Direction offset.
-            num_bins (int): Number of bins to divide 2*PI.
-            one_hot (bool): Whether to encode as one hot.
+            dir_offset (int, optional): Direction offset. Default to 0.
+            dir_limit_offset (float, optional): Offset to set the direction
+                range. Default to 0.0.
+            num_bins (int, optional): Number of bins to divide 2*PI.
+                Default to 2.
+            one_hot (bool, optional): Whether to encode as one hot.
+                Default to True.
Returns:
torch.Tensor: Encoded direction targets.
"""
rot_gt = reg_targets[..., 6]
-        offset_rot = limit_period(rot_gt - dir_offset, 0, 2 * np.pi)
+        offset_rot = limit_period(rot_gt - dir_offset, dir_limit_offset,
+                                  2 * np.pi)
dir_cls_targets = torch.floor(offset_rot /
(2 * np.pi / num_bins)).long()
dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1)
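For completeness, the `one_hot=True` branch that follows the clamp expands the bin indices into one-hot targets; a self-contained sketch with `num_bins=2`:

```python
import torch

num_bins = 2
dir_cls_targets = torch.tensor([0, 1, 1, 0])  # clamped bin indices

one_hot_targets = torch.zeros(*dir_cls_targets.shape, num_bins)
one_hot_targets.scatter_(-1, dir_cls_targets.unsqueeze(-1), 1.0)
# tensor([[1., 0.], [0., 1.], [0., 1.], [1., 0.]])
```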
......@@ -293,7 +297,7 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
attr_labels (list[Tensor]): Attributes indices of each box.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
-            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
+            gt_bboxes_ignore (list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
Returns:
......@@ -377,7 +381,10 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
if self.use_direction_classifier:
pos_dir_cls_targets = self.get_direction_target(
-                pos_bbox_targets_3d, self.dir_offset, one_hot=False)
+                pos_bbox_targets_3d,
+                self.dir_offset,
+                self.dir_limit_offset,
+                one_hot=False)
if self.diff_rad_by_sin:
pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference(
......@@ -502,11 +509,11 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
rescale (bool): If True, return boxes in original image space
Returns:
-            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
-                The first item is an (n, 5) tensor, where the first 4 columns \
-                are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
-                5-th column is a score between 0 and 1. The second item is a \
-                (n,) tensor where each item is the predicted class label of \
+            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
+                The first item is an (n, 5) tensor, where the first 4 columns
+                are bounding box positions (tl_x, tl_y, br_x, br_y) and the
+                5-th column is a score between 0 and 1. The second item is a
+                (n,) tensor where each item is the predicted class label of
the corresponding box.
"""
assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \
......@@ -575,7 +582,7 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
bbox_preds (list[Tensor]): Box energies / deltas for a single scale
level with shape (num_points * bbox_code_size, H, W).
dir_cls_preds (list[Tensor]): Box scores for direction class
-                predictions on a single scale level with shape \
+                predictions on a single scale level with shape
(num_points * 2, H, W)
attr_preds (list[Tensor]): Attribute scores for each scale level
Has shape (N, num_points * num_attrs, H, W)
......@@ -634,7 +641,7 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
if rescale:
bbox_pred[:, :2] /= bbox_pred[:, :2].new_tensor(scale_factor)
pred_center2d = bbox_pred[:, :3].clone()
-            bbox_pred[:, :3] = self.pts2Dto3D(bbox_pred[:, :3], view)
+            bbox_pred[:, :3] = points_img2cam(bbox_pred[:, :3], view)
mlvl_centers2d.append(pred_center2d)
mlvl_bboxes.append(bbox_pred)
mlvl_scores.append(scores)
......@@ -647,19 +654,13 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
mlvl_dir_scores = torch.cat(mlvl_dir_scores)
# change local yaw to global yaw for 3D nms
if mlvl_bboxes.shape[0] > 0:
-            dir_rot = limit_period(mlvl_bboxes[..., 6] - self.dir_offset, 0,
-                                   np.pi)
-            mlvl_bboxes[..., 6] = (
-                dir_rot + self.dir_offset +
-                np.pi * mlvl_dir_scores.to(mlvl_bboxes.dtype))
-        cam_intrinsic = mlvl_centers2d.new_zeros((4, 4))
-        cam_intrinsic[:view.shape[0], :view.shape[1]] = \
+        cam2img = mlvl_centers2d.new_zeros((4, 4))
+        cam2img[:view.shape[0], :view.shape[1]] = \
            mlvl_centers2d.new_tensor(view)
-        mlvl_bboxes[:, 6] = torch.atan2(
-            mlvl_centers2d[:, 0] - cam_intrinsic[0, 2],
-            cam_intrinsic[0, 0]) + mlvl_bboxes[:, 6]
+        mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers2d,
+                                                 mlvl_dir_scores,
+                                                 self.dir_offset, cam2img)
mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
mlvl_bboxes, box_dim=self.bbox_code_size,
origin=(0.5, 0.5, 0.5)).bev)
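`decode_yaw` replaces the two removed steps above. A hedged sketch of the equivalent math, reconstructed from the deleted lines (not the coder's source):

```python
import math
import torch

def decode_yaw_sketch(bboxes, centers2d, dir_scores, dir_offset, cam2img):
    if bboxes.shape[0] > 0:
        # Recombine the regressed rotation with the direction-bin score,
        # as the removed limit_period block did.
        dir_rot = bboxes[..., 6] - dir_offset
        dir_rot = dir_rot - torch.floor(dir_rot / math.pi) * math.pi
        bboxes[..., 6] = (dir_rot + dir_offset +
                          math.pi * dir_scores.to(bboxes.dtype))
    # Convert local (allocentric) yaw to global (egocentric) yaw using the
    # viewing-ray angle through each projected 3D center.
    bboxes[:, 6] = torch.atan2(centers2d[:, 0] - cam2img[0, 2],
                               cam2img[0, 0]) + bboxes[:, 6]
    return bboxes
```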
......@@ -695,14 +696,18 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
def pts2Dto3D(points, view):
"""
Args:
-            points (torch.Tensor): points in 2D images, [N, 3], \
+            points (torch.Tensor): points in 2D images, [N, 3],
                3 corresponds with x, y in the image and depth.
-            view (np.ndarray): camera instrinsic, [3, 3]
+            view (np.ndarray): camera intrinsic, [3, 3]
        Returns:
-            torch.Tensor: points in 3D space. [N, 3], \
+            torch.Tensor: points in 3D space. [N, 3],
3 corresponds with x, y, z in 3D space.
"""
+        warnings.warn('DeprecationWarning: This static method has been moved '
+                      'out of this class to mmdet3d/core. The function '
+                      'pts2Dto3D will be deprecated.')
assert view.shape[0] <= 4
assert view.shape[1] <= 4
assert points.shape[1] == 3
......@@ -715,7 +720,7 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
viewpad[:view.shape[0], :view.shape[1]] = points2D.new_tensor(view)
inv_viewpad = torch.inverse(viewpad).transpose(0, 1)
-        # Do operation in homogenous coordinates.
+        # Do operation in homogeneous coordinates.
nbr_points = unnorm_points2D.shape[0]
homo_points2D = torch.cat(
[unnorm_points2D,
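A worked example of the homogeneous-coordinate unprojection above (the same math now lives in `points_img2cam` in `mmdet3d.core`); the intrinsics are illustrative KITTI-like values:

```python
import torch

view = torch.tensor([[721.5, 0.0, 609.6],
                     [0.0, 721.5, 172.9],
                     [0.0, 0.0, 1.0]])         # 3x3 camera intrinsic
points = torch.tensor([[700.0, 200.0, 10.0]])  # (u, v, depth)

# Un-normalize to (u*d, v*d, d), pad K to 4x4, multiply by its inverse.
unnorm = torch.cat([points[:, :2] * points[:, 2:3], points[:, 2:3]], dim=1)
viewpad = torch.eye(4)
viewpad[:view.shape[0], :view.shape[1]] = view
inv_viewpad = torch.inverse(viewpad).transpose(0, 1)
homo = torch.cat([unnorm, unnorm.new_ones(unnorm.shape[0], 1)], dim=1)
points3d = torch.mm(homo, inv_viewpad)[:, :3]
# x = (700 - 609.6) * 10 / 721.5 ~= 1.25; z equals the input depth 10.
```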
......@@ -762,8 +767,8 @@ class FCOSMono3DHead(AnchorFreeMono3DHead):
Returns:
tuple:
-                concat_lvl_labels (list[Tensor]): Labels of each level. \
-                concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \
+                concat_lvl_labels (list[Tensor]): Labels of each level.
+                concat_lvl_bbox_targets (list[Tensor]): BBox targets of each
level.
"""
assert len(points) == len(self.regress_ranges)
......
......@@ -195,6 +195,7 @@ class FreeAnchor3DHead(Anchor3DHead):
matched_anchors,
matched_object_targets,
self.dir_offset,
+                    self.dir_limit_offset,
one_hot=False)
loss_dir = self.loss_dir(
dir_cls_preds_[matched].transpose(-2, -1),
......
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import numpy as np
import torch
from mmcv import ConfigDict
......@@ -25,13 +26,13 @@ class PointsObjClsModule(BaseModule):
Args:
in_channel (int): number of channels of seed point features.
-        num_convs (int): number of conv layers.
+        num_convs (int, optional): number of conv layers.
            Default: 3.
-        conv_cfg (dict): Config of convolution.
+        conv_cfg (dict, optional): Config of convolution.
            Default: dict(type='Conv1d').
-        norm_cfg (dict): Config of normalization.
+        norm_cfg (dict, optional): Config of normalization.
            Default: dict(type='BN1d').
-        act_cfg (dict): Config of activation.
+        act_cfg (dict, optional): Config of activation.
Default: dict(type='ReLU').
"""
......@@ -299,7 +300,7 @@ class GroupFree3DHead(BaseModule):
"""Forward pass.
Note:
-            The forward of GroupFree3DHead is devided into 2 steps:
+            The forward of GroupFree3DHead is divided into 2 steps:
1. Initial object candidates sampling.
2. Iterative object box prediction by transformer decoder.
......@@ -405,15 +406,15 @@ class GroupFree3DHead(BaseModule):
Args:
bbox_preds (dict): Predictions from forward of vote head.
points (list[torch.Tensor]): Input points.
-            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
                bboxes of each sample.
            gt_labels_3d (list[torch.Tensor]): Labels of each sample.
-            pts_semantic_mask (None | list[torch.Tensor]): Point-wise
+            pts_semantic_mask (list[torch.Tensor]): Point-wise
                semantic mask.
-            pts_instance_mask (None | list[torch.Tensor]): Point-wise
+            pts_instance_mask (list[torch.Tensor]): Point-wise
                instance mask.
            img_metas (list[dict]): Contain pcd and img's meta info.
-            gt_bboxes_ignore (None | list[torch.Tensor]): Specify
+            gt_bboxes_ignore (list[torch.Tensor]): Specify
                which bounding boxes to ignore.
            ret_target (bool): Return targets or not.
......@@ -545,12 +546,12 @@ class GroupFree3DHead(BaseModule):
Args:
points (list[torch.Tensor]): Points of each batch.
-            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
                bboxes of each batch.
            gt_labels_3d (list[torch.Tensor]): Labels of each batch.
-            pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic
+            pts_semantic_mask (list[torch.Tensor]): Point-wise semantic
                label of each batch.
-            pts_instance_mask (None | list[torch.Tensor]): Point-wise instance
+            pts_instance_mask (list[torch.Tensor]): Point-wise instance
label of each batch.
bbox_preds (torch.Tensor): Bounding box predictions of vote head.
max_gt_num (int): Max number of GTs for single batch.
......@@ -657,12 +658,12 @@ class GroupFree3DHead(BaseModule):
Args:
points (torch.Tensor): Points of each batch.
-            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth \
+            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth
                boxes of each batch.
            gt_labels_3d (torch.Tensor): Labels of each batch.
-            pts_semantic_mask (None | torch.Tensor): Point-wise semantic
+            pts_semantic_mask (torch.Tensor): Point-wise semantic
                label of each batch.
-            pts_instance_mask (None | torch.Tensor): Point-wise instance
+            pts_instance_mask (torch.Tensor): Point-wise instance
label of each batch.
max_gt_nums (int): Max number of GTs for single batch.
seed_points (torch.Tensor): Coordinates of seed points.
......@@ -710,7 +711,7 @@ class GroupFree3DHead(BaseModule):
if self.bbox_coder.with_rot:
vote_targets = points.new_zeros([num_points, 4 * self.gt_per_seed])
vote_target_idx = points.new_zeros([num_points], dtype=torch.long)
-        box_indices_all = gt_bboxes_3d.points_in_boxes(points)
+        box_indices_all = gt_bboxes_3d.points_in_boxes_part(points)
for i in range(gt_labels_3d.shape[0]):
box_indices = box_indices_all[:, i]
indices = torch.nonzero(
......@@ -880,7 +881,7 @@ class GroupFree3DHead(BaseModule):
Returns:
list[tuple[torch.Tensor]]: Bounding boxes, scores and labels.
"""
-        # support multi-stage predicitons
+        # support multi-stage predictions
assert self.test_cfg['prediction_stages'] in \
['last', 'all', 'last_three']
......@@ -951,7 +952,7 @@ class GroupFree3DHead(BaseModule):
box_dim=bbox.shape[-1],
with_yaw=self.bbox_coder.with_rot,
origin=(0.5, 0.5, 0.5))
-            box_indices = bbox.points_in_boxes(points)
+            box_indices = bbox.points_in_boxes_all(points)
corner3d = bbox.corners
minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6)))
......
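The two renames in this file follow the v1.0 op refactor: `points_in_boxes` becomes `points_in_boxes_part` (one box index per point, -1 outside every box) and the all-boxes variant becomes `points_in_boxes_all` (a per-box membership matrix). A toy axis-aligned sketch of the two output conventions; the real ops also handle yaw and run on CUDA:

```python
import torch

def points_in_boxes_all_sketch(points, boxes):
    # boxes: [M, 6] as (cx, cy, cz, dx, dy, dz); points: [N, 3].
    mins = boxes[:, :3] - boxes[:, 3:6] / 2
    maxs = boxes[:, :3] + boxes[:, 3:6] / 2
    inside = ((points[:, None] >= mins[None]) &
              (points[:, None] <= maxs[None])).all(-1)
    return inside  # [N, M] membership flags, like points_in_boxes_all

def points_in_boxes_part_sketch(points, boxes):
    inside = points_in_boxes_all_sketch(points, boxes)
    idx = torch.full((points.shape[0],), -1, dtype=torch.long)
    hit = inside.any(1)
    # an arbitrary containing box is reported when boxes overlap
    idx[hit] = inside.int().argmax(1)[hit]
    return idx  # [N] box index per point, like points_in_boxes_part
```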
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmcv.cnn import xavier_init
from torch import nn as nn
from mmdet3d.core.utils import get_ellip_gaussian_2D
from mmdet3d.models.model_utils import EdgeFusionModule
from mmdet3d.models.utils import (filter_outside_objs, get_edge_indices,
get_keypoints, handle_proj_objs)
from mmdet.core import multi_apply
from mmdet.core.bbox.builder import build_bbox_coder
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.utils import gaussian_radius, gen_gaussian_target
from mmdet.models.utils.gaussian_target import (get_local_maximum,
get_topk_from_heatmap,
transpose_and_gather_feat)
from .anchor_free_mono3d_head import AnchorFreeMono3DHead
@HEADS.register_module()
class MonoFlexHead(AnchorFreeMono3DHead):
r"""MonoFlex head used in `MonoFlex <https://arxiv.org/abs/2104.02323>`_
.. code-block:: none
/ --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> cls
|
| --> 3 x 3 conv --> 1 x 1 conv --> 2d bbox
|
| --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> 2d offsets
|
| --> 3 x 3 conv --> 1 x 1 conv --> keypoints offsets
|
| --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
feature
| --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
|
| --> 3 x 3 conv --> 1 x 1 conv --> 3d dimensions
|
| |--- 1 x 1 conv --> ori cls
| --> 3 x 3 conv --|
| |--- 1 x 1 conv --> ori offsets
|
| --> 3 x 3 conv --> 1 x 1 conv --> depth
|
\ --> 3 x 3 conv --> 1 x 1 conv --> depth uncertainty
Args:
use_edge_fusion (bool): Whether to use edge fusion module while
feature extraction.
edge_fusion_inds (list[tuple]): Indices of feature to use edge fusion.
edge_heatmap_ratio (float): Ratio of generating target heatmap.
filter_outside_objs (bool, optional): Whether to filter the
outside objects. Default: True.
        loss_cls (dict, optional): Config of classification loss.
            Default: loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0).
        loss_bbox (dict, optional): Config of localization loss.
            Default: loss_bbox=dict(type='IoULoss', loss_weight=0.1).
        loss_dir (dict, optional): Config of direction classification loss.
            Default: dict(type='MultiBinLoss', loss_weight=0.1).
        loss_keypoints (dict, optional): Config of keypoints loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_dims (dict, optional): Config of dimensions loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_offsets2d (dict, optional): Config of offsets2d loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_direct_depth (dict, optional): Config of directly regressed depth loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_keypoints_depth (dict, optional): Config of keypoints-decoded depth loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_combined_depth (dict, optional): Config of combined depth loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_attr (dict, optional): Config of attribute classification loss.
            Not used in MonoFlex. Default: None.
bbox_coder (dict, optional): Bbox coder for encoding and decoding boxes.
Default: dict(type='MonoFlexCoder', code_size=7).
norm_cfg (dict, optional): Dictionary to construct and config norm layer.
Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
init_cfg (dict): Initialization config dict. Default: None.
""" # noqa: E501
def __init__(self,
num_classes,
in_channels,
use_edge_fusion,
edge_fusion_inds,
edge_heatmap_ratio,
filter_outside_objs=True,
loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0),
loss_bbox=dict(type='IoULoss', loss_weight=0.1),
loss_dir=dict(type='MultiBinLoss', loss_weight=0.1),
loss_keypoints=dict(type='L1Loss', loss_weight=0.1),
loss_dims=dict(type='L1Loss', loss_weight=0.1),
loss_offsets2d=dict(type='L1Loss', loss_weight=0.1),
loss_direct_depth=dict(type='L1Loss', loss_weight=0.1),
loss_keypoints_depth=dict(type='L1Loss', loss_weight=0.1),
loss_combined_depth=dict(type='L1Loss', loss_weight=0.1),
loss_attr=None,
bbox_coder=dict(type='MonoFlexCoder', code_size=7),
norm_cfg=dict(type='BN'),
init_cfg=None,
init_bias=-2.19,
**kwargs):
self.use_edge_fusion = use_edge_fusion
self.edge_fusion_inds = edge_fusion_inds
super().__init__(
num_classes,
in_channels,
loss_cls=loss_cls,
loss_bbox=loss_bbox,
loss_dir=loss_dir,
loss_attr=loss_attr,
norm_cfg=norm_cfg,
init_cfg=init_cfg,
**kwargs)
self.filter_outside_objs = filter_outside_objs
self.edge_heatmap_ratio = edge_heatmap_ratio
self.init_bias = init_bias
self.loss_dir = build_loss(loss_dir)
self.loss_keypoints = build_loss(loss_keypoints)
self.loss_dims = build_loss(loss_dims)
self.loss_offsets2d = build_loss(loss_offsets2d)
self.loss_direct_depth = build_loss(loss_direct_depth)
self.loss_keypoints_depth = build_loss(loss_keypoints_depth)
self.loss_combined_depth = build_loss(loss_combined_depth)
self.bbox_coder = build_bbox_coder(bbox_coder)
def _init_edge_module(self):
"""Initialize edge fusion module for feature extraction."""
self.edge_fuse_cls = EdgeFusionModule(self.num_classes, 256)
for i in range(len(self.edge_fusion_inds)):
reg_inds, out_inds = self.edge_fusion_inds[i]
out_channels = self.group_reg_dims[reg_inds][out_inds]
fusion_layer = EdgeFusionModule(out_channels, 256)
layer_name = f'edge_fuse_reg_{reg_inds}_{out_inds}'
self.add_module(layer_name, fusion_layer)
def init_weights(self):
"""Initialize weights."""
super().init_weights()
self.conv_cls.bias.data.fill_(self.init_bias)
xavier_init(self.conv_regs[4][0], gain=0.01)
xavier_init(self.conv_regs[7][0], gain=0.01)
for m in self.conv_regs.modules():
if isinstance(m, nn.Conv2d):
if m.bias is not None:
nn.init.constant_(m.bias, 0)
def _init_predictor(self):
"""Initialize predictor layers of the head."""
self.conv_cls_prev = self._init_branch(
conv_channels=self.cls_branch,
conv_strides=(1, ) * len(self.cls_branch))
self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels,
1)
# init regression head
self.conv_reg_prevs = nn.ModuleList()
# init output head
self.conv_regs = nn.ModuleList()
# group_reg_dims:
# ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, ))
for i in range(len(self.group_reg_dims)):
reg_dims = self.group_reg_dims[i]
reg_branch_channels = self.reg_branch[i]
out_channel = self.out_channels[i]
reg_list = nn.ModuleList()
if len(reg_branch_channels) > 0:
self.conv_reg_prevs.append(
self._init_branch(
conv_channels=reg_branch_channels,
conv_strides=(1, ) * len(reg_branch_channels)))
for reg_dim in reg_dims:
reg_list.append(nn.Conv2d(out_channel, reg_dim, 1))
self.conv_regs.append(reg_list)
else:
self.conv_reg_prevs.append(None)
for reg_dim in reg_dims:
reg_list.append(nn.Conv2d(self.feat_channels, reg_dim, 1))
self.conv_regs.append(reg_list)
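To make the loop above concrete: each regression group gets an optional shared conv tower (`conv_reg_prevs`) plus one 1x1 conv per sub-target. A sketch using MonoFlex's dims from the comment and an illustrative `feat_channels`:

```python
from torch import nn

group_reg_dims = ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, ))
feat_channels = 64  # illustrative; comes from the head config
conv_regs = nn.ModuleList()
for reg_dims in group_reg_dims:
    conv_regs.append(
        nn.ModuleList([nn.Conv2d(feat_channels, dim, 1) for dim in reg_dims]))

# forward_single concatenates every branch along dim=1:
# 4 + 2 + 20 + 3 + 3 + (8 + 8) + 1 + 1 = 50 regression channels per location.
```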
def _init_layers(self):
"""Initialize layers of the head."""
self._init_predictor()
if self.use_edge_fusion:
self._init_edge_module()
def forward_train(self, x, input_metas, gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels,
gt_bboxes_ignore, proposal_cfg, **kwargs):
"""
Args:
x (list[Tensor]): Features from FPN.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes (list[Tensor]): Ground truth bboxes of the image,
shape (num_gts, 4).
gt_labels (list[Tensor]): Ground truth labels of each box,
shape (num_gts,).
gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image,
shape (num_gts, self.bbox_code_size).
gt_labels_3d (list[Tensor]): 3D ground truth labels of each box,
shape (num_gts,).
centers2d (list[Tensor]): Projected 3D center of each box,
shape (num_gts, 2).
depths (list[Tensor]): Depth of projected 3D center of each box,
shape (num_gts,).
attr_labels (list[Tensor]): Attribute labels of each box,
shape (num_gts,).
gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
proposal_cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used
Returns:
tuple:
                losses (dict[str, Tensor]): A dictionary of loss components.
proposal_list (list[Tensor]): Proposals of each image.
"""
outs = self(x, input_metas)
if gt_labels is None:
loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths,
attr_labels, input_metas)
else:
loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels,
input_metas)
losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
if proposal_cfg is None:
return losses
else:
proposal_list = self.get_bboxes(
*outs, input_metas, cfg=proposal_cfg)
return losses, proposal_list
def forward(self, feats, input_metas):
"""Forward features from the upstream network.
Args:
feats (list[Tensor]): Features from the upstream network, each is
a 4D-tensor.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
tuple:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
"""
mlvl_input_metas = [input_metas for i in range(len(feats))]
return multi_apply(self.forward_single, feats, mlvl_input_metas)
def forward_single(self, x, input_metas):
"""Forward features of a single scale level.
Args:
x (Tensor): Feature maps from a specific FPN feature level.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
tuple: Scores for each class, bbox predictions.
"""
img_h, img_w = input_metas[0]['pad_shape'][:2]
batch_size, _, feat_h, feat_w = x.shape
downsample_ratio = img_h / feat_h
for conv_cls_prev_layer in self.conv_cls_prev:
cls_feat = conv_cls_prev_layer(x)
out_cls = self.conv_cls(cls_feat)
if self.use_edge_fusion:
# calculate the edge indices for the batch data
edge_indices_list = get_edge_indices(
input_metas, downsample_ratio, device=x.device)
edge_lens = [
edge_indices.shape[0] for edge_indices in edge_indices_list
]
max_edge_len = max(edge_lens)
edge_indices = x.new_zeros((batch_size, max_edge_len, 2),
dtype=torch.long)
for i in range(batch_size):
edge_indices[i, :edge_lens[i]] = edge_indices_list[i]
# cls feature map edge fusion
out_cls = self.edge_fuse_cls(cls_feat, out_cls, edge_indices,
edge_lens, feat_h, feat_w)
bbox_pred = []
for i in range(len(self.group_reg_dims)):
reg_feat = x.clone()
# feature regression head
if len(self.reg_branch[i]) > 0:
for conv_reg_prev_layer in self.conv_reg_prevs[i]:
reg_feat = conv_reg_prev_layer(reg_feat)
for j, conv_reg in enumerate(self.conv_regs[i]):
out_reg = conv_reg(reg_feat)
# Use Edge Fusion Module
if self.use_edge_fusion and (i, j) in self.edge_fusion_inds:
# reg feature map edge fusion
out_reg = getattr(self, 'edge_fuse_reg_{}_{}'.format(
i, j))(reg_feat, out_reg, edge_indices, edge_lens,
feat_h, feat_w)
bbox_pred.append(out_reg)
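# the channel dim of the concatenated prediction equals
# sum(self.group_reg_dims)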
bbox_pred = torch.cat(bbox_pred, dim=1)
cls_score = out_cls.sigmoid() # turn to 0-1
cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4)
return cls_score, bbox_pred
def get_bboxes(self, cls_scores, bbox_preds, input_metas):
"""Generate bboxes from bbox head predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level.
bbox_preds (list[Tensor]): Box regression for each scale.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]:
Each item in result_list is 4-tuple.
"""
assert len(cls_scores) == len(bbox_preds) == 1
cam2imgs = torch.stack([
cls_scores[0].new_tensor(input_meta['cam2img'])
for input_meta in input_metas
])
batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap(
cls_scores[0],
bbox_preds[0],
input_metas,
cam2imgs=cam2imgs,
topk=100,
kernel=3)
result_list = []
for img_id in range(len(input_metas)):
bboxes = batch_bboxes[img_id]
scores = batch_scores[img_id]
labels = batch_topk_labels[img_id]
keep_idx = scores > 0.25
bboxes = bboxes[keep_idx]
scores = scores[keep_idx]
labels = labels[keep_idx]
bboxes = input_metas[img_id]['box_type_3d'](
bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5))
attrs = None
result_list.append((bboxes, scores, labels, attrs))
return result_list
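# A minimal inference sketch (hypothetical variable names, assuming a
# constructed head and features already extracted by the backbone/neck):
#   cls_scores, bbox_preds = head(feats, input_metas)
#   results = head.get_bboxes(cls_scores, bbox_preds, input_metas)
#   for bboxes, scores, labels, attrs in results:
#       ...  # bboxes is a CameraInstance3DBoxes, attrs is None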
def decode_heatmap(self,
cls_score,
reg_pred,
input_metas,
cam2imgs,
topk=100,
kernel=3):
"""Transform outputs into detections raw bbox predictions.
Args:
cls_score (Tensor): Center prediction heatmap,
shape (B, num_classes, H, W).
reg_pred (Tensor): Box regression map,
shape (B, channel, H, W).
input_metas (List[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cam2imgs (Tensor): Camera intrinsic matrices,
shape (B, 4, 4).
topk (int, optional): Get top k center keypoints from heatmap.
Default 100.
kernel (int, optional): Max pooling kernel for extract local
maximum pixels. Default 3.
Returns:
tuple[torch.Tensor]: Decoded output of the head, containing
the following Tensors:
- batch_bboxes (Tensor): Coords of each 3D box.
shape (B, k, 7)
- batch_scores (Tensor): Scores of each 3D box.
shape (B, k)
- batch_topk_labels (Tensor): Categories of each 3D box.
shape (B, k)
"""
img_h, img_w = input_metas[0]['pad_shape'][:2]
batch_size, _, feat_h, feat_w = cls_score.shape
downsample_ratio = img_h / feat_h
center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel)
*batch_dets, topk_ys, topk_xs = get_topk_from_heatmap(
center_heatmap_pred, k=topk)
batch_scores, batch_index, batch_topk_labels = batch_dets
regression = transpose_and_gather_feat(reg_pred, batch_index)
regression = regression.view(-1, 8)
pred_base_centers2d = torch.cat(
[topk_xs.view(-1, 1),
topk_ys.view(-1, 1).float()], dim=1)
preds = self.bbox_coder.decode(regression, batch_topk_labels,
downsample_ratio, cam2imgs)
pred_locations = self.bbox_coder.decode_location(
pred_base_centers2d, preds['offsets2d'], preds['combined_depth'],
cam2imgs, downsample_ratio)
pred_yaws = self.bbox_coder.decode_orientation(
preds['orientations']).unsqueeze(-1)
pred_dims = preds['dimensions']
batch_bboxes = torch.cat((pred_locations, pred_dims, pred_yaws), dim=1)
batch_bboxes = batch_bboxes.view(batch_size, -1, self.bbox_code_size)
return batch_bboxes, batch_scores, batch_topk_labels
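# In brief, decode_heatmap performs: local-maximum suppression on the
# class heatmap -> top-k peak selection -> gathering the regression
# channels at the peaks -> bbox_coder decoding into locations,
# dimensions and yaw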
def get_predictions(self, pred_reg, labels3d, centers2d, reg_mask,
batch_indices, input_metas, downsample_ratio):
"""Prepare predictions for computing loss.
Args:
pred_reg (Tensor): Box regression map.
shape (B, channel, H , W).
labels3d (Tensor): Labels of each 3D box.
shape (B * max_objs, )
centers2d (Tensor): Coords of each projected 3D box
center on image. shape (N, 2)
reg_mask (Tensor): Mask indicating the existence of each 3D box,
shape (B * max_objs, ).
batch_indices (Tensor): Batch indices of the 3D boxes,
shape (N, ).
input_metas (list[dict]): Meta information of each image,
e.g., image size, scaling factor, etc.
downsample_ratio (int): The stride of feature map.
Returns:
dict: The predictions for computing loss.
"""
batch, channel = pred_reg.shape[0], pred_reg.shape[1]
w = pred_reg.shape[3]
cam2imgs = torch.stack([
centers2d.new_tensor(input_meta['cam2img'])
for input_meta in input_metas
])
# (batch_size, 4, 4) -> (N, 4, 4)
cam2imgs = cam2imgs[batch_indices, :, :]
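# flatten the integer (x, y) centers into row-major feature-map
# indices (ind = y * W + x) for gathering per-object regressions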
centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0]
centers2d_inds = centers2d_inds.view(batch, -1)
pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds)
pred_regression_pois = pred_regression.view(-1, channel)[reg_mask]
preds = self.bbox_coder.decode(pred_regression_pois, labels3d,
downsample_ratio, cam2imgs)
return preds
def get_targets(self, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list,
gt_labels_3d_list, centers2d_list, depths_list, feat_shape,
img_shape, input_metas):
"""Get training targets for batch images.
Args:
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each
image, shape (num_gt, 4).
gt_labels_list (list[Tensor]): Ground truth labels of each
box, shape (num_gt,).
gt_bboxes_3d_list (list[:obj:`CameraInstance3DBoxes`]): 3D
Ground truth bboxes of each image,
shape (num_gt, bbox_code_size).
gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of
each box, shape (num_gt,).
centers2d_list (list[Tensor]): Projected 3D centers onto 2D
image, shape (num_gt, 2).
depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
image, each has shape (num_gt, 1).
feat_shape (tuple[int]): Feature map shape in
(B, C, H, W) order.
img_shape (tuple[int]): Image shape in [h, w] format.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
tuple[Tensor, Tensor, dict]: The first Tensor is the center heatmap
target, the second is the average factor for loss normalization,
and the dict has the components below:
- base_centers2d_target (Tensor): Coords of each projected 3D box
center on image. shape (B * max_objs, 2), [dtype: int]
- labels3d (Tensor): Labels of each 3D box.
shape (N, )
- reg_mask (Tensor): Mask of the existence of the 3D box.
shape (B * max_objs, )
- batch_indices (Tensor): Batch id of the 3D box.
shape (N, )
- depth_target (Tensor): Depth target of each 3D box.
shape (N, )
- keypoints2d_target (Tensor): Keypoints of each projected 3D box
on image. shape (N, 10, 2)
- keypoints_mask (Tensor): Keypoints mask of each projected 3D
box on image. shape (N, 10)
- keypoints_depth_mask (Tensor): Depths decoded from keypoints
of each 3D box. shape (N, 3)
- orientations_target (Tensor): Orientation (encoded local yaw)
target of each 3D box. shape (N, )
- offsets2d_target (Tensor): Offsets target of each projected
3D box. shape (N, 2)
- dimensions_target (Tensor): Dimensions target of each 3D box.
shape (N, 3)
- downsample_ratio (int): The stride of feature map.
"""
img_h, img_w = img_shape[:2]
batch_size, _, feat_h, feat_w = feat_shape
width_ratio = float(feat_w / img_w) # 1/4
height_ratio = float(feat_h / img_h) # 1/4
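# a single scalar downsample ratio is used downstream, so width and
# height must be downsampled equally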
assert width_ratio == height_ratio
# Whether to filter the objects which are not in FOV.
if self.filter_outside_objs:
filter_outside_objs(gt_bboxes_list, gt_labels_list,
gt_bboxes_3d_list, gt_labels_3d_list,
centers2d_list, input_metas)
# transform centers2d to base centers2d for regression and
# heatmap generation.
# centers2d = int(base_centers2d) + offsets2d
base_centers2d_list, offsets2d_list, trunc_mask_list = \
handle_proj_objs(centers2d_list, gt_bboxes_list, input_metas)
keypoints2d_list, keypoints_mask_list, keypoints_depth_mask_list = \
get_keypoints(gt_bboxes_3d_list, centers2d_list, input_metas)
center_heatmap_target = gt_bboxes_list[-1].new_zeros(
[batch_size, self.num_classes, feat_h, feat_w])
for batch_id in range(batch_size):
# project gt_bboxes from input image to feat map
gt_bboxes = gt_bboxes_list[batch_id] * width_ratio
gt_labels = gt_labels_list[batch_id]
# project base centers2d from input image to feat map
gt_base_centers2d = base_centers2d_list[batch_id] * width_ratio
trunc_masks = trunc_mask_list[batch_id]
for j, base_center2d in enumerate(gt_base_centers2d):
if trunc_masks[j]:
# for outside objects, generate ellipse heatmap
base_center2d_x_int, base_center2d_y_int = \
base_center2d.int()
scale_box_w = min(base_center2d_x_int - gt_bboxes[j][0],
gt_bboxes[j][2] - base_center2d_x_int)
scale_box_h = min(base_center2d_y_int - gt_bboxes[j][1],
gt_bboxes[j][3] - base_center2d_y_int)
radius_x = scale_box_w * self.edge_heatmap_ratio
radius_y = scale_box_h * self.edge_heatmap_ratio
radius_x, radius_y = max(0, int(radius_x)), max(
0, int(radius_y))
assert min(radius_x, radius_y) == 0
ind = gt_labels[j]
get_ellip_gaussian_2D(
center_heatmap_target[batch_id, ind],
[base_center2d_x_int, base_center2d_y_int], radius_x,
radius_y)
else:
base_center2d_x_int, base_center2d_y_int = \
base_center2d.int()
scale_box_h = (gt_bboxes[j][3] - gt_bboxes[j][1])
scale_box_w = (gt_bboxes[j][2] - gt_bboxes[j][0])
radius = gaussian_radius([scale_box_h, scale_box_w],
min_overlap=0.7)
radius = max(0, int(radius))
ind = gt_labels[j]
gen_gaussian_target(
center_heatmap_target[batch_id, ind],
[base_center2d_x_int, base_center2d_y_int], radius)
avg_factor = max(1, center_heatmap_target.eq(1).sum())
num_ctrs = [centers2d.shape[0] for centers2d in centers2d_list]
max_objs = max(num_ctrs)
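# record which sample each gt box comes from,
# e.g. [0, 0, 1, 1, 1] for a batch with 2 and 3 boxes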
batch_indices = [
centers2d_list[0].new_full((num_ctrs[i], ), i)
for i in range(batch_size)
]
batch_indices = torch.cat(batch_indices, dim=0)
reg_mask = torch.zeros(
(batch_size, max_objs),
dtype=torch.bool).to(base_centers2d_list[0].device)
gt_bboxes_3d = input_metas[0]['box_type_3d'].cat(gt_bboxes_3d_list)
gt_bboxes_3d = gt_bboxes_3d.to(base_centers2d_list[0].device)
# encode original local yaw to multibin format
orientations_target = self.bbox_coder.encode(gt_bboxes_3d)
batch_base_centers2d = base_centers2d_list[0].new_zeros(
(batch_size, max_objs, 2))
for i in range(batch_size):
reg_mask[i, :num_ctrs[i]] = 1
batch_base_centers2d[i, :num_ctrs[i]] = base_centers2d_list[i]
flatten_reg_mask = reg_mask.flatten()
# transform base centers2d from input scale to output scale
batch_base_centers2d = batch_base_centers2d.view(-1, 2) * width_ratio
dimensions_target = gt_bboxes_3d.tensor[:, 3:6]
labels_3d = torch.cat(gt_labels_3d_list)
keypoints2d_target = torch.cat(keypoints2d_list)
keypoints_mask = torch.cat(keypoints_mask_list)
keypoints_depth_mask = torch.cat(keypoints_depth_mask_list)
offsets2d_target = torch.cat(offsets2d_list)
bboxes2d = torch.cat(gt_bboxes_list)
# transform FCOS style bbox into [x1, y1, x2, y2] format.
bboxes2d_target = torch.cat([bboxes2d[:, 0:2] * -1, bboxes2d[:, 2:]],
dim=-1)
depths = torch.cat(depths_list)
target_labels = dict(
base_centers2d_target=batch_base_centers2d.int(),
labels3d=labels_3d,
reg_mask=flatten_reg_mask,
batch_indices=batch_indices,
bboxes2d_target=bboxes2d_target,
depth_target=depths,
keypoints2d_target=keypoints2d_target,
keypoints_mask=keypoints_mask,
keypoints_depth_mask=keypoints_depth_mask,
orientations_target=orientations_target,
offsets2d_target=offsets2d_target,
dimensions_target=dimensions_target,
downsample_ratio=1 / width_ratio)
return center_heatmap_target, avg_factor, target_labels
def loss(self,
cls_scores,
bbox_preds,
gt_bboxes,
gt_labels,
gt_bboxes_3d,
gt_labels_3d,
centers2d,
depths,
attr_labels,
input_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level,
each with shape (B, num_classes, H, W).
bbox_preds (list[Tensor]): Box regression maps for each scale
level, each a 4D-tensor with shape (B, channel, H, W).
gt_bboxes (list[Tensor]): Ground truth bboxes for each image.
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): Class indices corresponding to each box.
shape (num_gts, ).
gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D ground
truth boxes. They correspond to the flipped gt_bboxes.
gt_labels_3d (list[Tensor]): Same as gt_labels.
centers2d (list[Tensor]): 2D centers on the image.
shape (num_gts, 2).
depths (list[Tensor]): Depth ground truth.
shape (num_gts, ).
attr_labels (list[Tensor]): Attributes indices of each box.
In kitti it's None.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
boxes can be ignored when computing the loss.
Default: None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert len(cls_scores) == len(bbox_preds) == 1
assert attr_labels is None
assert gt_bboxes_ignore is None
center2d_heatmap = cls_scores[0]
pred_reg = bbox_preds[0]
center2d_heatmap_target, avg_factor, target_labels = \
self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths,
center2d_heatmap.shape,
input_metas[0]['pad_shape'],
input_metas)
preds = self.get_predictions(
pred_reg=pred_reg,
labels3d=target_labels['labels3d'],
centers2d=target_labels['base_centers2d_target'],
reg_mask=target_labels['reg_mask'],
batch_indices=target_labels['batch_indices'],
input_metas=input_metas,
downsample_ratio=target_labels['downsample_ratio'])
# heatmap loss
loss_cls = self.loss_cls(
center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor)
# bbox2d regression loss
loss_bbox = self.loss_bbox(preds['bboxes2d'],
target_labels['bboxes2d_target'])
# keypoints loss, the keypoints in predictions and targets are all
# in local coordinates. The mask dtype must be bool, not int
# or float, so that it is applied as a boolean index
keypoints2d_mask = target_labels['keypoints_mask']
loss_keypoints = self.loss_keypoints(
preds['keypoints2d'][keypoints2d_mask],
target_labels['keypoints2d_target'][keypoints2d_mask])
# orientations loss
loss_dir = self.loss_dir(preds['orientations'],
target_labels['orientations_target'])
# dimensions loss
loss_dims = self.loss_dims(preds['dimensions'],
target_labels['dimensions_target'])
# offsets for center heatmap
loss_offsets2d = self.loss_offsets2d(preds['offsets2d'],
target_labels['offsets2d_target'])
# directly regressed depth loss with direct depth uncertainty loss
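# a sketch of the weighting: the depth residual is scaled by exp(-u)
# and u itself is penalized, i.e. roughly
# loss ~ exp(-u) * |d_pred - d_gt| + loss_weight * u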
direct_depth_weights = torch.exp(-preds['direct_depth_uncertainty'])
loss_weight_1 = self.loss_direct_depth.loss_weight
loss_direct_depth = self.loss_direct_depth(
preds['direct_depth'], target_labels['depth_target'],
direct_depth_weights)
loss_uncertainty_1 =\
preds['direct_depth_uncertainty'] * loss_weight_1
loss_direct_depth = loss_direct_depth + loss_uncertainty_1.mean()
# keypoints decoded depth loss with keypoints depth uncertainty loss
depth_mask = target_labels['keypoints_depth_mask']
depth_target = target_labels['depth_target'].unsqueeze(-1).repeat(1, 3)
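# each box carries three keypoint-decoded depth estimates (one per
# keypoint group), so the scalar depth target is tiled to (N, 3)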
valid_keypoints_depth_uncertainty = preds[
'keypoints_depth_uncertainty'][depth_mask]
valid_keypoints_depth_weights = torch.exp(
-valid_keypoints_depth_uncertainty)
loss_keypoints_depth = self.loss_keypoint_depth(
preds['keypoints_depth'][depth_mask], depth_target[depth_mask],
valid_keypoints_depth_weights)
loss_weight_2 = self.loss_keypoints_depth.loss_weight
loss_uncertainty_2 =\
valid_keypoints_depth_uncertainty * loss_weight_2
loss_keypoints_depth = loss_keypoints_depth + loss_uncertainty_2.mean()
# combined depth loss to optimize the uncertainty
loss_combined_depth = self.loss_combined_depth(
preds['combined_depth'], target_labels['depth_target'])
loss_dict = dict(
loss_cls=loss_cls,
loss_bbox=loss_bbox,
loss_keypoints=loss_keypoints,
loss_dir=loss_dir,
loss_dims=loss_dims,
loss_offsets2d=loss_offsets2d,
loss_direct_depth=loss_direct_depth,
loss_keypoints_depth=loss_keypoints_depth,
loss_combined_depth=loss_combined_depth)
return loss_dict
......@@ -60,15 +60,15 @@ class PartA2RPNHead(Anchor3DHead):
type='Anchor3DRangeGenerator',
range=[0, -39.68, -1.78, 69.12, 39.68, -1.78],
strides=[2],
sizes=[[1.6, 3.9, 1.56]],
sizes=[[3.9, 1.6, 1.56]],
rotations=[0, 1.57],
custom_values=[],
reshape_out=False),
assigner_per_size=False,
assign_per_class=False,
diff_rad_by_sin=True,
dir_offset=0,
dir_limit_offset=1,
dir_offset=-np.pi / 2,
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='CrossEntropyLoss',
......@@ -100,20 +100,20 @@ class PartA2RPNHead(Anchor3DHead):
bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
dir_cls_preds (list[torch.Tensor]): Multi-level direction
class predictions.
gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes \
gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes
of each sample.
gt_labels (list[torch.Tensor]): Labels of each sample.
input_metas (list[dict]): Point cloud and image's meta info.
gt_bboxes_ignore (None | list[torch.Tensor]): Specify
gt_bboxes_ignore (list[torch.Tensor]): Specify
which bounding boxes can be ignored when computing the loss.
Returns:
dict[str, list[torch.Tensor]]: Classification, bbox, and \
dict[str, list[torch.Tensor]]: Classification, bbox, and
direction losses of each level.
- loss_rpn_cls (list[torch.Tensor]): Classification losses.
- loss_rpn_bbox (list[torch.Tensor]): Box regression losses.
- loss_rpn_dir (list[torch.Tensor]): Direction classification \
- loss_rpn_dir (list[torch.Tensor]): Direction classification
losses.
"""
loss_dict = super().loss(cls_scores, bbox_preds, dir_cls_preds,
......@@ -143,7 +143,7 @@ class PartA2RPNHead(Anchor3DHead):
mlvl_anchors (List[torch.Tensor]): Multi-level anchors
in single batch.
input_meta (list[dict]): Contain pcd and img's meta info.
cfg (None | :obj:`ConfigDict`): Training or testing config.
cfg (:obj:`ConfigDict`): Training or testing config.
rescale (bool): Whether to rescale bboxes.
Returns:
......@@ -207,7 +207,7 @@ class PartA2RPNHead(Anchor3DHead):
mlvl_dir_scores = torch.cat(mlvl_dir_scores)
# shape [k, num_class] before sigmoid
# PartA2 need to keep raw classification score
# becase the bbox head in the second stage does not have
# because the bbox head in the second stage does not have
# classification branch,
# roi head need this score as classification score
mlvl_cls_score = torch.cat(mlvl_cls_score)
......@@ -240,7 +240,7 @@ class PartA2RPNHead(Anchor3DHead):
Multi-level bbox.
score_thr (float): Score threshold.
max_num (int): Max number of bboxes after nms.
cfg (None | :obj:`ConfigDict`): Training or testing config.
cfg (:obj:`ConfigDict`): Training or testing config.
input_meta (dict): Contain pcd and img's meta info.
Returns:
......