"vscode:/vscode.git/clone" did not exist on "5489406c4a06780c23357880588f807a5f2f52e7"
Unverified Commit d7067e44 authored by Wenwei Zhang, committed by GitHub

Bump version to v1.1.0rc2

Bump to v1.1.0rc2
parents 28fe73d2 fb0e57e5
# Copyright (c) OpenMMLab. All rights reserved.
from mmdet.models.losses import FocalLoss, SmoothL1Loss, binary_cross_entropy
from .axis_aligned_iou_loss import AxisAlignedIoULoss, axis_aligned_iou_loss
from .chamfer_distance import ChamferDistance, chamfer_distance
from .multibin_loss import MultiBinLoss
from .paconv_regularization_loss import PAConvRegularizationLoss
from .rotated_iou_loss import RotatedIoU3DLoss, rotated_iou_3d_loss
from .uncertain_smooth_l1_loss import UncertainL1Loss, UncertainSmoothL1Loss
__all__ = [
'FocalLoss', 'SmoothL1Loss', 'binary_cross_entropy', 'ChamferDistance',
'chamfer_distance', 'axis_aligned_iou_loss', 'AxisAlignedIoULoss',
'PAConvRegularizationLoss', 'UncertainL1Loss', 'UncertainSmoothL1Loss',
'MultiBinLoss', 'RotatedIoU3DLoss', 'rotated_iou_3d_loss'
]
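As a quick orientation, the sketch below shows how one of the losses exported above can be used directly. It is only illustrative: it assumes mmdet3d is installed with a CUDA-enabled mmcv (RotatedIoU3DLoss relies on the diff_iou_rotated_3d op), and the tensor shapes and values are made up.

import torch
from mmdet3d.models.losses import RotatedIoU3DLoss

iou_loss = RotatedIoU3DLoss(reduction='mean', loss_weight=1.0)
pred = torch.rand(4, 7).cuda()    # (x, y, z, w, l, h, alpha), illustrative values
target = torch.rand(4, 7).cuda()
loss = iou_loss(pred, target)     # scalar tensor: loss_weight * mean(1 - IoU)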
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet.models.losses.utils import weighted_loss
from torch import nn as nn
from mmdet3d.registry import MODELS
from mmdet3d.structures import AxisAlignedBboxOverlaps3D
@weighted_loss
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet.models.losses.utils import weighted_loss
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.registry import MODELS
@weighted_loss
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet.models.losses.utils import weight_reduce_loss
from torch import nn as nn
from mmdet3d.registry import MODELS
from ..layers import PAConv, PAConvCUDA
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional
import torch
from mmcv.ops import diff_iou_rotated_3d
from mmdet.models.losses.utils import weighted_loss
from torch import Tensor
from torch import nn as nn
from mmdet3d.registry import MODELS
@weighted_loss
def rotated_iou_3d_loss(pred: Tensor, target: Tensor) -> Tensor:
"""Calculate the IoU loss (1-IoU) of two sets of rotated bounding boxes.
Note that predictions and targets are matched one-to-one.
Args:
pred (torch.Tensor): Bbox predictions with shape [N, 7]
(x, y, z, w, l, h, alpha).
target (torch.Tensor): Bbox targets (gt) with shape [N, 7]
(x, y, z, w, l, h, alpha).
Returns:
torch.Tensor: IoU loss between predictions and targets.
"""
iou_loss = 1 - diff_iou_rotated_3d(pred.unsqueeze(0),
target.unsqueeze(0))[0]
return iou_loss
@MODELS.register_module()
class RotatedIoU3DLoss(nn.Module):
"""Calculate the IoU loss (1-IoU) of rotated bounding boxes.
Args:
reduction (str): Method to reduce losses.
The valid reduction methods are 'none', 'sum' or 'mean'.
loss_weight (float, optional): Weight of loss. Defaults to 1.0.
"""
def __init__(self,
reduction: str = 'mean',
loss_weight: Optional[float] = 1.0):
super().__init__()
self.reduction = reduction
self.loss_weight = loss_weight
def forward(self,
pred: Tensor,
target: Tensor,
weight: Optional[Tensor] = None,
avg_factor: Optional[int] = None,
reduction_override: Optional[str] = None,
**kwargs) -> Tensor:
"""Forward function of loss calculation.
Args:
pred (torch.Tensor): Bbox predictions with shape [..., 7]
(x, y, z, w, l, h, alpha).
target (torch.Tensor): Bbox targets (gt) with shape [..., 7]
(x, y, z, w, l, h, alpha).
weight (torch.Tensor | float, optional): Weight of loss.
Defaults to None.
avg_factor (int, optional): Average factor that is used to average
the loss. Defaults to None.
reduction_override (str, optional): Method to reduce losses.
The valid reduction methods are 'none', 'sum' or 'mean'.
Defaults to None.
Returns:
torch.Tensor: IoU loss between predictions and targets.
"""
if weight is not None and not torch.any(weight > 0):
return pred.sum() * weight.sum() # 0
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
if weight is not None and weight.dim() > 1:
weight = weight.mean(-1)
loss = self.loss_weight * rotated_iou_3d_loss(
pred,
target,
weight,
reduction=reduction,
avg_factor=avg_factor,
**kwargs)
return loss
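The @weighted_loss decorator used by the losses in this commit factors the weight/reduction/avg_factor handling out of each loss function. The following is a simplified, self-contained stand-in (not the mmdet implementation) that illustrates the pattern seen in rotated_iou_3d_loss and RotatedIoU3DLoss.forward above.

import functools

import torch


def weighted_loss_sketch(loss_func):
    # Wrap an element-wise loss so it accepts weight/reduction/avg_factor.
    @functools.wraps(loss_func)
    def wrapper(pred, target, weight=None, reduction='mean',
                avg_factor=None, **kwargs):
        loss = loss_func(pred, target, **kwargs)
        if weight is not None:
            loss = loss * weight
        if reduction == 'sum':
            return loss.sum()
        if reduction == 'mean':
            denom = avg_factor if avg_factor is not None else loss.numel()
            return loss.sum() / denom
        return loss  # reduction == 'none'
    return wrapper


@weighted_loss_sketch
def l1_loss(pred, target):
    # Only the element-wise loss lives here, as with @weighted_loss above.
    return torch.abs(pred - target)


pred, target = torch.rand(4, 7), torch.rand(4, 7)
print(l1_loss(pred, target, weight=torch.ones(4, 7), reduction='mean'))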
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet.models.losses.utils import weighted_loss
from torch import nn as nn
from mmdet3d.registry import MODELS
@weighted_loss
......
......@@ -2,7 +2,9 @@
from .pillar_scatter import PointPillarsScatter
from .sparse_encoder import SparseEncoder, SparseEncoderSASSD
from .sparse_unet import SparseUNet
from .voxel_set_abstraction import VoxelSetAbstraction
__all__ = [
'PointPillarsScatter', 'SparseEncoder', 'SparseEncoderSASSD', 'SparseUNet',
'VoxelSetAbstraction'
]
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Tuple
import torch
from mmcv.ops import points_in_boxes_all, three_interpolate, three_nn
from mmdet.models.losses import sigmoid_focal_loss, smooth_l1_loss
from torch import Tensor
from torch import nn as nn
from mmdet3d.models.layers import SparseBasicBlock, make_sparse_convmodule
from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
from mmdet3d.registry import MODELS
from mmdet3d.structures import BaseInstance3DBoxes
if IS_SPCONV2_AVAILABLE:
from spconv.pytorch import SparseConvTensor, SparseSequential
......@@ -37,6 +41,8 @@ class SparseEncoder(nn.Module):
Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)).
block_type (str, optional): Type of the block to use.
Defaults to 'conv_module'.
return_middle_feats (bool): Whether to output middle features.
Defaults to False.
"""
def __init__(self,
......@@ -50,7 +56,8 @@ class SparseEncoder(nn.Module):
64)),
encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1,
1)),
block_type='conv_module',
return_middle_feats=False):
super().__init__()
assert block_type in ['conv_module', 'basicblock']
self.sparse_shape = sparse_shape
......@@ -62,6 +69,7 @@ class SparseEncoder(nn.Module):
self.encoder_paddings = encoder_paddings
self.stage_num = len(self.encoder_channels)
self.fp16_enabled = False
self.return_middle_feats = return_middle_feats
# Spconv init all weight on its own
assert isinstance(order, tuple) and len(order) == 3
......@@ -113,7 +121,14 @@ class SparseEncoder(nn.Module):
batch_size (int): Batch size.
Returns:
torch.Tensor | tuple[torch.Tensor, list]: The returned spatial
features include:
- spatial_features (torch.Tensor): Spatial features output by
the last layer.
- encode_features (List[SparseConvTensor], optional): Middle-layer
output features, returned only when self.return_middle_feats
is True.
"""
coors = coors.int()
input_sp_tensor = SparseConvTensor(voxel_features, coors,
......@@ -133,7 +148,10 @@ class SparseEncoder(nn.Module):
N, C, D, H, W = spatial_features.shape
spatial_features = spatial_features.view(N, C * D, H, W)
if self.return_middle_feats:
return spatial_features, encode_features
else:
return spatial_features
def make_encoder_layers(self,
make_block,
......@@ -238,17 +256,17 @@ class SparseEncoderSASSD(SparseEncoder):
"""
def __init__(self,
in_channels: int,
sparse_shape: List[int],
order: Tuple[str] = ('conv', 'norm', 'act'),
norm_cfg: dict = dict(type='BN1d', eps=1e-3, momentum=0.01),
base_channels: int = 16,
output_channels: int = 128,
encoder_channels: Tuple[tuple] = ((16, ), (32, 32, 32),
(64, 64, 64), (64, 64, 64)),
encoder_paddings: Tuple[tuple] = ((1, ), (1, 1, 1), (1, 1, 1),
((0, 1, 1), 1, 1)),
block_type: str = 'conv_module'):
super(SparseEncoderSASSD, self).__init__(
in_channels=in_channels,
sparse_shape=sparse_shape,
......@@ -264,7 +282,11 @@ class SparseEncoderSASSD(SparseEncoder):
self.point_cls = nn.Linear(64, 1, bias=False)
self.point_reg = nn.Linear(64, 3, bias=False)
def forward(self,
voxel_features: Tensor,
coors: Tensor,
batch_size: int,
test_mode: bool = False) -> Tuple[Tensor, tuple]:
"""Forward of SparseEncoder.
Args:
......@@ -276,7 +298,7 @@ class SparseEncoderSASSD(SparseEncoder):
Defaults to False.
Returns:
Tensor: Backbone features.
tuple[torch.Tensor]: Mean feature value of the points,
Classification result of the points,
Regression offsets of the points.
......@@ -333,14 +355,17 @@ class SparseEncoderSASSD(SparseEncoder):
return spatial_features, point_misc
def get_auxiliary_targets(self,
points_feats: Tensor,
gt_bboxes_3d: List[BaseInstance3DBoxes],
enlarge: float = 1.0) -> Tuple[Tensor, Tensor]:
"""Get auxiliary target.
Args:
points_feats (torch.Tensor): Mean features of the points.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
boxes for each sample.
enlarge (float, optional): Enlarged scale. Defaults to 1.0.
Returns:
tuple[torch.Tensor]: Label of the points and
......@@ -348,31 +373,32 @@ class SparseEncoderSASSD(SparseEncoder):
"""
center_offsets = list()
pts_labels = list()
for i in range(len(gt_bboxes_3d)):
boxes3d = gt_bboxes_3d[i].tensor.detach().clone()
idx = torch.nonzero(points_feats[:, 0] == i).view(-1)
point_xyz = points_feats[idx, 1:].detach().clone()
boxes3d[:, 3:6] *= enlarge
pts_in_flag, center_offset = self.calculate_pts_offsets(
point_xyz, boxes3d)
pts_label = pts_in_flag.max(0)[0].byte()
pts_labels.append(pts_label)
center_offsets.append(center_offset)
center_offsets = torch.cat(center_offsets)
pts_labels = torch.cat(pts_labels).to(center_offsets.device)
return pts_labels, center_offsets
def calculate_pts_offsets(self, points: Tensor,
bboxes_3d: Tensor) -> Tuple[Tensor, Tensor]:
"""Find all boxes in which each point is, as well as the offsets from
the box centers.
Args:
points (torch.Tensor): [M, 3], [x, y, z] in LiDAR coordinate
bboxes_3d (torch.Tensor): [T, 7],
num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
(x, y, z) is the bottom center.
......@@ -383,44 +409,41 @@ class SparseEncoderSASSD(SparseEncoder):
if it belongs to the box, with the shape of (M, 3).
Default background = 0.
"""
boxes_num = len(bboxes_3d)
pts_num = len(points)
box_indices = points_in_boxes_all(points[None, ...], bboxes_3d[None,
...])
pts_indices = box_indices.squeeze(0).transpose(0, 1)
center_offsets = torch.zeros_like(points).to(points.device)
for i in range(boxes_num):
for j in range(pts_num):
if pts_indices[i][j] == 1:
center_offsets[j][0] = points[j][0] - bboxes_3d[i][0]
center_offsets[j][1] = points[j][1] - bboxes_3d[i][1]
center_offsets[j][2] = (
points[j][2] -
(bboxes_3d[i][2] + bboxes_3d[i][2] / 2.0))
return pts_indices, center_offsets
def aux_loss(self, points: Tensor, point_cls: Tensor, point_reg: Tensor,
gt_bboxes_3d: Tensor) -> dict:
"""Calculate auxiliary loss.
Args:
points (torch.Tensor): Mean feature value of the points.
point_cls (torch.Tensor): Classification result of the points.
point_reg (torch.Tensor): Regression offsets of the points.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
boxes for each sample.
Returns:
dict: Auxiliary loss.
"""
num_boxes = len(gt_bboxes_3d)
pts_labels, center_targets = self.get_auxiliary_targets(
points, gt_bboxes_3d)
rpn_cls_target = pts_labels.long()
pos = (pts_labels > 0).float()
......@@ -449,11 +472,13 @@ class SparseEncoderSASSD(SparseEncoder):
return dict(aux_loss_cls=aux_loss_cls, aux_loss_reg=aux_loss_reg)
def make_auxiliary_points(
self,
source_tensor: Tensor,
target: Tensor,
offset: Tuple = (0., -40., -3.),
voxel_size: Tuple = (.05, .05, .1)
) -> Tensor:
"""Make auxiliary points for loss computation.
Args:
......
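The main user-facing change to SparseEncoder in this diff is the return_middle_feats flag. A hedged construction sketch (assumes mmdet3d with spconv is installed; in_channels and sparse_shape below are illustrative KITTI-like values, not taken from a real config):

from mmdet3d.models.middle_encoders import SparseEncoder

encoder = SparseEncoder(
    in_channels=4,
    sparse_shape=[41, 1600, 1408],
    return_middle_feats=True)
# spatial_features, encode_features = encoder(voxel_features, coors, batch_size)
# With return_middle_feats=False (the default), only spatial_features is returned.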
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional
import mmengine
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmcv.ops.furthest_point_sample import furthest_point_sample
from mmengine.model import BaseModule
from mmdet3d.registry import MODELS
from mmdet3d.utils import InstanceList
def bilinear_interpolate_torch(inputs, x, y):
"""Bilinear interpolate for inputs."""
x0 = torch.floor(x).long()
x1 = x0 + 1
y0 = torch.floor(y).long()
y1 = y0 + 1
x0 = torch.clamp(x0, 0, inputs.shape[1] - 1)
x1 = torch.clamp(x1, 0, inputs.shape[1] - 1)
y0 = torch.clamp(y0, 0, inputs.shape[0] - 1)
y1 = torch.clamp(y1, 0, inputs.shape[0] - 1)
Ia = inputs[y0, x0]
Ib = inputs[y1, x0]
Ic = inputs[y0, x1]
Id = inputs[y1, x1]
wa = (x1.type_as(x) - x) * (y1.type_as(y) - y)
wb = (x1.type_as(x) - x) * (y - y0.type_as(y))
wc = (x - x0.type_as(x)) * (y1.type_as(y) - y)
wd = (x - x0.type_as(x)) * (y - y0.type_as(y))
ans = torch.t((torch.t(Ia) * wa)) + torch.t(torch.t(Ib) * wb) + torch.t(
torch.t(Ic) * wc) + torch.t(torch.t(Id) * wd)
return ans
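An illustrative call of bilinear_interpolate_torch as defined above (assuming the function is in scope); the feature map and query coordinates are made up for the example.

import torch

feature_map = torch.arange(12, dtype=torch.float32).view(3, 4, 1)  # (H, W, C)
x = torch.tensor([0.5, 2.25])  # fractional column (width) indices
y = torch.tensor([0.5, 1.75])  # fractional row (height) indices
sampled = bilinear_interpolate_torch(feature_map, x, y)
print(sampled.shape)  # torch.Size([2, 1]): one interpolated feature per query point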
@MODELS.register_module()
class VoxelSetAbstraction(BaseModule):
"""Voxel set abstraction module for PVRCNN and PVRCNN++.
Args:
num_keypoints (int): The number of key points sampled from the
raw point cloud.
fused_out_channel (int): Number of output channels of the fused
key point features. Defaults to 128.
voxel_size (list[float]): Size of voxels. Defaults to
[0.05, 0.05, 0.1].
point_cloud_range (list[float]): Point cloud range. Defaults to
[0, -40, -3, 70.4, 40, 1].
voxel_sa_cfgs_list (List[dict or ConfigDict], optional): List of SA
module configs used to gather key point features from multi-scale
voxel features. Defaults to None.
rawpoints_sa_cfgs (dict or ConfigDict, optional): SA module config
used to gather key point features from the raw points. Defaults
to None.
bev_feat_channel (int): Number of BEV feature channels.
Defaults to 256.
bev_scale_factor (int): BEV feature map scale factor. Defaults to 8.
voxel_center_as_source (bool): Whether to use voxel centers as
key points. Defaults to False.
norm_cfg (dict[str]): Config of normalization layer. Defaults to
dict(type='BN2d', eps=1e-5, momentum=0.1).
bias (bool | str, optional): If specified as 'auto', it will be
decided by norm_cfg: bias is set to True if norm_cfg is None,
otherwise False. Defaults to 'auto'.
"""
def __init__(self,
num_keypoints: int,
fused_out_channel: int = 128,
voxel_size: list = [0.05, 0.05, 0.1],
point_cloud_range: list = [0, -40, -3, 70.4, 40, 1],
voxel_sa_cfgs_list: Optional[list] = None,
rawpoints_sa_cfgs: Optional[dict] = None,
bev_feat_channel: int = 256,
bev_scale_factor: int = 8,
voxel_center_as_source: bool = False,
norm_cfg: dict = dict(type='BN2d', eps=1e-5, momentum=0.1),
bias: str = 'auto') -> None:
super().__init__()
self.num_keypoints = num_keypoints
self.fused_out_channel = fused_out_channel
self.voxel_size = voxel_size
self.point_cloud_range = point_cloud_range
self.voxel_center_as_source = voxel_center_as_source
gathered_channel = 0
if rawpoints_sa_cfgs is not None:
self.rawpoints_sa_layer = MODELS.build(rawpoints_sa_cfgs)
gathered_channel += sum(
[x[-1] for x in rawpoints_sa_cfgs.mlp_channels])
else:
self.rawpoints_sa_layer = None
if voxel_sa_cfgs_list is not None:
self.voxel_sa_configs_list = voxel_sa_cfgs_list
self.voxel_sa_layers = nn.ModuleList()
for voxel_sa_config in voxel_sa_cfgs_list:
cur_layer = MODELS.build(voxel_sa_config)
self.voxel_sa_layers.append(cur_layer)
gathered_channel += sum(
[x[-1] for x in voxel_sa_config.mlp_channels])
else:
self.voxel_sa_layers = None
if bev_feat_channel is not None and bev_scale_factor is not None:
self.bev_cfg = mmengine.Config(
dict(
bev_feat_channels=bev_feat_channel,
bev_scale_factor=bev_scale_factor))
gathered_channel += bev_feat_channel
else:
self.bev_cfg = None
self.point_feature_fusion_layer = nn.Sequential(
ConvModule(
gathered_channel,
fused_out_channel,
kernel_size=(1, 1),
stride=(1, 1),
conv_cfg=dict(type='Conv2d'),
norm_cfg=norm_cfg,
bias=bias))
def interpolate_from_bev_features(self, keypoints: torch.Tensor,
bev_features: torch.Tensor,
batch_size: int,
bev_scale_factor: int) -> torch.Tensor:
"""Gather key points features from bev feature map by interpolate.
Args:
keypoints (torch.Tensor): Sampled key points with shape
(N1 + N2 + ..., NDim).
bev_features (torch.Tensor): Bev feature map from the first
stage with shape (B, C, H, W).
batch_size (int): Input batch size.
bev_scale_factor (int): Bev feature map scale factor.
Returns:
torch.Tensor: Key point features gathered from the BEV feature
map, with shape (N1 + N2 + ..., C).
"""
x_idxs = (keypoints[..., 0] -
self.point_cloud_range[0]) / self.voxel_size[0]
y_idxs = (keypoints[..., 1] -
self.point_cloud_range[1]) / self.voxel_size[1]
x_idxs = x_idxs / bev_scale_factor
y_idxs = y_idxs / bev_scale_factor
point_bev_features_list = []
for k in range(batch_size):
cur_x_idxs = x_idxs[k, ...]
cur_y_idxs = y_idxs[k, ...]
cur_bev_features = bev_features[k].permute(1, 2, 0) # (H, W, C)
point_bev_features = bilinear_interpolate_torch(
cur_bev_features, cur_x_idxs, cur_y_idxs)
point_bev_features_list.append(point_bev_features)
point_bev_features = torch.cat(
point_bev_features_list, dim=0) # (N1 + N2 + ..., C)
return point_bev_features.view(batch_size, keypoints.shape[1], -1)
def get_voxel_centers(self, coors: torch.Tensor,
scale_factor: float) -> torch.Tensor:
"""Get voxel centers coordinate.
Args:
coors (torch.Tensor): Coordinates of voxels shape is Nx(1+NDim),
where 1 represents the batch index.
scale_factor (float): Scale factor.
Returns:
torch.Tensor: Voxel centers coordinate with shape (N, 3).
"""
assert coors.shape[1] == 4
voxel_centers = coors[:, [3, 2, 1]].float() # (xyz)
voxel_size = torch.tensor(
self.voxel_size,
device=voxel_centers.device).float() * scale_factor
pc_range = torch.tensor(
self.point_cloud_range[0:3], device=voxel_centers.device).float()
voxel_centers = (voxel_centers + 0.5) * voxel_size + pc_range
return voxel_centers
def sample_key_points(self, points: List[torch.Tensor],
coors: torch.Tensor) -> torch.Tensor:
"""Sample key points from raw points cloud.
Args:
points (List[torch.Tensor]): Point cloud of each sample.
coors (torch.Tensor): Coordinates of voxels shape is Nx(1+NDim),
where 1 represents the batch index.
Returns:
torch.Tensor: (B, M, 3) Key points of each sample.
M is num_keypoints.
"""
assert points is not None or coors is not None
if self.voxel_center_as_source:
_src_points = self.get_voxel_centers(coors=coors, scale_factor=1)
batch_size = coors[-1, 0].item() + 1
src_points = [
_src_points[coors[:, 0] == b] for b in range(batch_size)
]
else:
src_points = [p[..., :3] for p in points]
keypoints_list = []
for points_to_sample in src_points:
num_points = points_to_sample.shape[0]
cur_pt_idxs = furthest_point_sample(
points_to_sample.unsqueeze(dim=0).contiguous(),
self.num_keypoints).long()[0]
if num_points < self.num_keypoints:
times = int(self.num_keypoints / num_points) + 1
non_empty = cur_pt_idxs[:num_points]
cur_pt_idxs = non_empty.repeat(times)[:self.num_keypoints]
keypoints = points_to_sample[cur_pt_idxs]
keypoints_list.append(keypoints)
keypoints = torch.stack(keypoints_list, dim=0) # (B, M, 3)
return keypoints
def forward(self, batch_inputs_dict: dict, feats_dict: dict,
rpn_results_list: InstanceList) -> dict:
"""Extract point-wise features from multi-input.
Args:
batch_inputs_dict (dict): The model input dict, which includes
'points' and 'voxels' keys.
- points (list[torch.Tensor]): Point cloud of each sample.
- voxels (dict[torch.Tensor]): Voxels of the batch sample.
feats_dict (dict): Contains features from the first
stage.
rpn_results_list (List[:obj:`InstanceData`]): Detection results
of rpn head.
Returns:
dict: Contains point-wise features, including:
- keypoints (torch.Tensor): Sampled key points.
- keypoint_features (torch.Tensor): Gathered key points
features from multi input.
fusion_keypoint_features (torch.Tensor): Keypoint features
fused by point_feature_fusion_layer.
"""
points = batch_inputs_dict['points']
voxel_encode_features = feats_dict['multi_scale_3d_feats']
bev_encode_features = feats_dict['spatial_feats']
if self.voxel_center_as_source:
voxels_coors = batch_inputs_dict['voxels']['coors']
else:
voxels_coors = None
keypoints = self.sample_key_points(points, voxels_coors)
point_features_list = []
batch_size = len(points)
if self.bev_cfg is not None:
point_bev_features = self.interpolate_from_bev_features(
keypoints, bev_encode_features, batch_size,
self.bev_cfg.bev_scale_factor)
point_features_list.append(point_bev_features.contiguous())
batch_size, num_keypoints, _ = keypoints.shape
key_xyz = keypoints.view(-1, 3)
key_xyz_batch_cnt = key_xyz.new_zeros(batch_size).int().fill_(
num_keypoints)
if self.rawpoints_sa_layer is not None:
batch_points = torch.cat(points, dim=0)
batch_cnt = [len(p) for p in points]
xyz = batch_points[:, :3].contiguous()
features = None
if batch_points.size(1) > 0:
features = batch_points[:, 3:].contiguous()
xyz_batch_cnt = xyz.new_tensor(batch_cnt, dtype=torch.int32)
pooled_points, pooled_features = self.rawpoints_sa_layer(
xyz=xyz.contiguous(),
xyz_batch_cnt=xyz_batch_cnt,
new_xyz=key_xyz.contiguous(),
new_xyz_batch_cnt=key_xyz_batch_cnt,
features=features.contiguous(),
)
point_features_list.append(pooled_features.contiguous().view(
batch_size, num_keypoints, -1))
if self.voxel_sa_layers is not None:
for k, voxel_sa_layer in enumerate(self.voxel_sa_layers):
cur_coords = voxel_encode_features[k].indices
xyz = self.get_voxel_centers(
coors=cur_coords,
scale_factor=self.voxel_sa_configs_list[k].scale_factor
).contiguous()
xyz_batch_cnt = xyz.new_zeros(batch_size).int()
for bs_idx in range(batch_size):
xyz_batch_cnt[bs_idx] = (cur_coords[:, 0] == bs_idx).sum()
pooled_points, pooled_features = voxel_sa_layer(
xyz=xyz.contiguous(),
xyz_batch_cnt=xyz_batch_cnt,
new_xyz=key_xyz.contiguous(),
new_xyz_batch_cnt=key_xyz_batch_cnt,
features=voxel_encode_features[k].features.contiguous(),
)
point_features_list.append(pooled_features.contiguous().view(
batch_size, num_keypoints, -1))
point_features = torch.cat(
point_features_list, dim=-1).view(batch_size * num_keypoints, -1,
1)
fusion_point_features = self.point_feature_fusion_layer(
point_features.unsqueeze(dim=-1)).squeeze(dim=-1)
batch_idxs = torch.arange(
batch_size * num_keypoints, device=keypoints.device
) // num_keypoints # batch indexes of each key points
batch_keypoints_xyz = torch.cat(
(batch_idxs.to(key_xyz.dtype).unsqueeze(dim=-1), key_xyz), dim=-1)
return dict(
keypoint_features=point_features.squeeze(dim=-1),
fusion_keypoint_features=fusion_point_features.squeeze(dim=-1),
keypoints=batch_keypoints_xyz)
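As a standalone numeric check of the voxel-center math in get_voxel_centers above (a re-derivation, not an mmdet3d call; the voxel size and point-cloud range are illustrative KITTI-like values):

import torch

coors = torch.tensor([[0, 10, 200, 400]])        # (batch_idx, z, y, x)
voxel_size = torch.tensor([0.05, 0.05, 0.1])     # (x, y, z) cell size in meters
pc_range_min = torch.tensor([0.0, -40.0, -3.0])  # minimum (x, y, z) of the range
scale_factor = 1.0

centers = coors[:, [3, 2, 1]].float()            # reorder grid indices to (x, y, z)
centers = (centers + 0.5) * (voxel_size * scale_factor) + pc_range_min
print(centers)  # tensor([[20.0250, -29.9750, -1.9500]])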
# Copyright (c) OpenMMLab. All rights reserved.
from mmdet.models.necks.fpn import FPN
from .dla_neck import DLANeck
from .imvoxel_neck import OutdoorImVoxelNeck
from .pointnet2_fp_neck import PointNetFPNeck
......
......@@ -5,10 +5,11 @@ from .h3d_roi_head import H3DRoIHead
from .mask_heads import PointwiseSemanticHead, PrimitiveHead
from .part_aggregation_roi_head import PartAggregationROIHead
from .point_rcnn_roi_head import PointRCNNRoIHead
from .pv_rcnn_roi_head import PVRCNNRoiHead
from .roi_extractors import Single3DRoIAwareExtractor, SingleRoIExtractor
__all__ = [
'Base3DRoIHead', 'PartAggregationROIHead', 'PointwiseSemanticHead',
'Single3DRoIAwareExtractor', 'PartA2BboxHead', 'SingleRoIExtractor',
'H3DRoIHead', 'PrimitiveHead', 'PointRCNNRoIHead', 'PVRCNNRoiHead'
]
# Copyright (c) OpenMMLab. All rights reserved.
from mmdet.models.roi_heads import BaseRoIHead
from mmdet3d.registry import MODELS, TASK_UTILS
class Base3DRoIHead(BaseRoIHead):
"""Base class for 3d RoIHeads."""
......
......@@ -3,12 +3,14 @@ from mmdet.models.roi_heads.bbox_heads import (BBoxHead, ConvFCBBoxHead,
DoubleConvFCBBoxHead,
Shared2FCBBoxHead,
Shared4Conv1FCBBoxHead)
from .h3d_bbox_head import H3DBboxHead
from .parta2_bbox_head import PartA2BboxHead
from .point_rcnn_bbox_head import PointRCNNBboxHead
from .pv_rcnn_bbox_head import PVRCNNBBoxHead
__all__ = [
'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead',
'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', 'PartA2BboxHead',
'H3DBboxHead', 'PointRCNNBboxHead', 'PVRCNNBBoxHead'
]
......@@ -3,6 +3,7 @@ from typing import Dict, List, Optional, Tuple
import torch
from mmcv.cnn import ConvModule
from mmdet.models.utils import multi_apply
from mmengine.model import BaseModule
from mmengine.structures import InstanceData
from torch import Tensor
......@@ -15,7 +16,6 @@ from mmdet3d.models.losses import chamfer_distance
from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures import (BaseInstance3DBoxes, DepthInstance3DBoxes,
Det3DDataSample)
@MODELS.register_module()
......
......@@ -4,6 +4,7 @@ from typing import Dict, List, Tuple
import numpy as np
import torch
from mmcv.cnn import ConvModule
from mmdet.models.utils import multi_apply
from mmengine.model import normal_init
from mmengine.structures import InstanceData
from torch import Tensor
......@@ -11,7 +12,6 @@ from torch import Tensor
from mmdet3d.models import make_sparse_convmodule
from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
from mmdet3d.utils.typing import InstanceList
if IS_SPCONV2_AVAILABLE:
from spconv.pytorch import (SparseConvTensor, SparseMaxPool3d,
......
......@@ -6,6 +6,7 @@ import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmcv.cnn.bricks import build_conv_layer
from mmdet.models.utils import multi_apply
from mmengine.model import BaseModule, normal_init
from mmengine.structures import InstanceData
from torch import Tensor
......@@ -16,7 +17,6 @@ from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures.bbox_3d import (LiDARInstance3DBoxes,
rotation_3d_in_axis, xywhr2xyxyr)
from mmdet3d.utils.typing import InstanceList, SamplingResultList
@MODELS.register_module()
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional, Tuple
import numpy as np
import torch
from mmcv.cnn import ConvModule
from mmdet.models.task_modules.samplers import SamplingResult
from mmdet.models.utils import multi_apply
from mmengine.model import BaseModule
from mmengine.structures import InstanceData
from torch import nn as nn
from mmdet3d.models.builder import build_loss
from mmdet3d.models.layers import nms_bev, nms_normal_bev
from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures.bbox_3d import (LiDARInstance3DBoxes,
rotation_3d_in_axis, xywhr2xyxyr)
from mmdet3d.utils import InstanceList
@MODELS.register_module()
class PVRCNNBBoxHead(BaseModule):
"""PVRCNN BBox head.
Args:
in_channels (int): The number of input channels.
grid_size (int): The number of grid points in roi bbox.
num_classes (int): The number of classes.
class_agnostic (bool): Whether to generate class-agnostic predictions.
Defaults to True.
shared_fc_channels (tuple(int)): Out channels of each shared fc layer.
Defaults to (256, 256).
cls_channels (tuple(int)): Out channels of each classification layer.
Defaults to (256, 256).
reg_channels (tuple(int)): Out channels of each regression layer.
Defaults to (256, 256).
dropout_ratio (float): Dropout ratio. Defaults to 0.3.
with_corner_loss (bool): Whether to use corner loss or not.
Defaults to True.
bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for box head.
Defaults to dict(type='DeltaXYZWLHRBBoxCoder').
norm_cfg (dict): Type of normalization method.
Defaults to dict(type='BN1d', eps=1e-5, momentum=0.1)
loss_bbox (dict): Config dict of box regression loss.
loss_cls (dict): Config dict of classification loss.
init_cfg (dict, optional): Initialization config of the model.
"""
def __init__(
self,
in_channels: int,
grid_size: int,
num_classes: int,
class_agnostic: bool = True,
shared_fc_channels: Tuple[int] = (256, 256),
cls_channels: Tuple[int] = (256, 256),
reg_channels: Tuple[int] = (256, 256),
dropout_ratio: float = 0.3,
with_corner_loss: bool = True,
bbox_coder: dict = dict(type='DeltaXYZWLHRBBoxCoder'),
norm_cfg: dict = dict(type='BN2d', eps=1e-5, momentum=0.1),
loss_bbox: dict = dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_cls: dict = dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=True,
reduction='none',
loss_weight=1.0),
init_cfg: Optional[dict] = dict(
type='Xavier', layer=['Conv2d', 'Conv1d'], distribution='uniform')
) -> None:
super(PVRCNNBBoxHead, self).__init__(init_cfg=init_cfg)
self.init_cfg = init_cfg
self.num_classes = num_classes
self.with_corner_loss = with_corner_loss
self.class_agnostic = class_agnostic
self.bbox_coder = TASK_UTILS.build(bbox_coder)
self.loss_bbox = build_loss(loss_bbox)
self.loss_cls = build_loss(loss_cls)
self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
cls_out_channels = 1 if class_agnostic else num_classes
self.reg_out_channels = self.bbox_coder.code_size * cls_out_channels
if self.use_sigmoid_cls:
self.cls_out_channels = cls_out_channels
else:
self.cls_out_channels = cls_out_channels + 1
self.dropout_ratio = dropout_ratio
self.grid_size = grid_size
# The effective in_channels equals the per-point channels times the number of grid points in the RoI box.
in_channels *= (self.grid_size**3)
self.in_channels = in_channels
self.shared_fc_layer = self._make_fc_layers(
in_channels, shared_fc_channels,
range(len(shared_fc_channels) - 1), norm_cfg)
self.cls_layer = self._make_fc_layers(
shared_fc_channels[-1],
cls_channels,
range(1),
norm_cfg,
out_channels=self.cls_out_channels)
self.reg_layer = self._make_fc_layers(
shared_fc_channels[-1],
reg_channels,
range(1),
norm_cfg,
out_channels=self.reg_out_channels)
def _make_fc_layers(self,
in_channels: int,
fc_channels: list,
dropout_indices: list,
norm_cfg: dict,
out_channels: Optional[int] = None) -> torch.nn.Module:
"""Initial a full connection layer.
Args:
in_channels (int): Module in channels.
fc_channels (list): Full connection layer channels.
dropout_indices (list): Dropout indices.
norm_cfg (dict): Type of normalization method.
out_channels (int, optional): Module out channels.
"""
fc_layers = []
pre_channel = in_channels
for k in range(len(fc_channels)):
fc_layers.append(
ConvModule(
pre_channel,
fc_channels[k],
kernel_size=(1, 1),
stride=(1, 1),
norm_cfg=norm_cfg,
conv_cfg=dict(type='Conv2d'),
bias=False,
inplace=True))
pre_channel = fc_channels[k]
if self.dropout_ratio >= 0 and k in dropout_indices:
fc_layers.append(nn.Dropout(self.dropout_ratio))
if out_channels is not None:
fc_layers.append(
nn.Conv2d(fc_channels[-1], out_channels, 1, bias=True))
fc_layers = nn.Sequential(*fc_layers)
return fc_layers
def forward(self, feats: torch.Tensor) -> Tuple[torch.Tensor]:
"""Forward pvrcnn bbox head.
Args:
feats (torch.Tensor): Batch point-wise features.
Returns:
tuple[torch.Tensor]: Score of class and bbox predictions.
"""
# (B * N, 6, 6, 6, C)
rcnn_batch_size = feats.shape[0]
feats = feats.permute(0, 4, 1, 2,
3).contiguous().view(rcnn_batch_size, -1, 1, 1)
# (BxN, C*6*6*6)
shared_feats = self.shared_fc_layer(feats)
cls_score = self.cls_layer(shared_feats).transpose(
1, 2).contiguous().view(-1, self.cls_out_channels) # (B, 1)
bbox_pred = self.reg_layer(shared_feats).transpose(
1, 2).contiguous().view(-1, self.reg_out_channels) # (B, C)
return cls_score, bbox_pred
def loss(self, cls_score: torch.Tensor, bbox_pred: torch.Tensor,
rois: torch.Tensor, labels: torch.Tensor,
bbox_targets: torch.Tensor, pos_gt_bboxes: torch.Tensor,
reg_mask: torch.Tensor, label_weights: torch.Tensor,
bbox_weights: torch.Tensor) -> Dict:
"""Coumputing losses.
Args:
cls_score (torch.Tensor): Scores of each roi.
bbox_pred (torch.Tensor): Predictions of bboxes.
rois (torch.Tensor): Roi bboxes.
labels (torch.Tensor): Labels of class.
bbox_targets (torch.Tensor): Target of positive bboxes.
pos_gt_bboxes (torch.Tensor): Ground truths of positive bboxes.
reg_mask (torch.Tensor): Mask for positive bboxes.
label_weights (torch.Tensor): Weights of class loss.
bbox_weights (torch.Tensor): Weights of bbox loss.
Returns:
dict: Computed losses.
- loss_cls (torch.Tensor): Loss of classes.
- loss_bbox (torch.Tensor): Loss of bboxes.
- loss_corner (torch.Tensor): Loss of corners.
"""
losses = dict()
rcnn_batch_size = cls_score.shape[0]
# calculate class loss
cls_flat = cls_score.view(-1)
loss_cls = self.loss_cls(cls_flat, labels, label_weights)
losses['loss_cls'] = loss_cls
# calculate regression loss
code_size = self.bbox_coder.code_size
pos_inds = (reg_mask > 0)
if pos_inds.any() == 0:
# fake a part loss
losses['loss_bbox'] = 0 * bbox_pred.sum()
if self.with_corner_loss:
losses['loss_corner'] = 0 * bbox_pred.sum()
else:
pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds]
bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat(
1, pos_bbox_pred.shape[-1])
loss_bbox = self.loss_bbox(
pos_bbox_pred.unsqueeze(dim=0), bbox_targets.unsqueeze(dim=0),
bbox_weights_flat.unsqueeze(dim=0))
losses['loss_bbox'] = loss_bbox
if self.with_corner_loss:
pos_roi_boxes3d = rois[..., 1:].view(-1, code_size)[pos_inds]
pos_roi_boxes3d = pos_roi_boxes3d.view(-1, code_size)
batch_anchors = pos_roi_boxes3d.clone().detach()
pos_rois_rotation = pos_roi_boxes3d[..., 6].view(-1)
roi_xyz = pos_roi_boxes3d[..., 0:3].view(-1, 3)
batch_anchors[..., 0:3] = 0
# decode boxes
pred_boxes3d = self.bbox_coder.decode(
batch_anchors,
pos_bbox_pred.view(-1, code_size)).view(-1, code_size)
pred_boxes3d[..., 0:3] = rotation_3d_in_axis(
pred_boxes3d[..., 0:3].unsqueeze(1),
pos_rois_rotation,
axis=2).squeeze(1)
pred_boxes3d[:, 0:3] += roi_xyz
# calculate corner loss
loss_corner = self.get_corner_loss_lidar(
pred_boxes3d, pos_gt_bboxes)
losses['loss_corner'] = loss_corner.mean()
return losses
def get_targets(self,
sampling_results: SamplingResult,
rcnn_train_cfg: dict,
concat: bool = True) -> Tuple[torch.Tensor]:
"""Generate targets.
Args:
sampling_results (list[:obj:`SamplingResult`]):
Sampled results from rois.
rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn.
concat (bool): Whether to concatenate targets between batches.
Returns:
tuple[torch.Tensor]: Targets of boxes and class prediction.
"""
pos_bboxes_list = [res.pos_bboxes for res in sampling_results]
pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results]
iou_list = [res.iou for res in sampling_results]
targets = multi_apply(
self._get_target_single,
pos_bboxes_list,
pos_gt_bboxes_list,
iou_list,
cfg=rcnn_train_cfg)
(label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
bbox_weights) = targets
if concat:
label = torch.cat(label, 0)
bbox_targets = torch.cat(bbox_targets, 0)
pos_gt_bboxes = torch.cat(pos_gt_bboxes, 0)
reg_mask = torch.cat(reg_mask, 0)
label_weights = torch.cat(label_weights, 0)
label_weights /= torch.clamp(label_weights.sum(), min=1.0)
bbox_weights = torch.cat(bbox_weights, 0)
bbox_weights /= torch.clamp(bbox_weights.sum(), min=1.0)
return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
bbox_weights)
def _get_target_single(self, pos_bboxes: torch.Tensor,
pos_gt_bboxes: torch.Tensor, ious: torch.Tensor,
cfg: dict) -> Tuple[torch.Tensor]:
"""Generate training targets for a single sample.
Args:
pos_bboxes (torch.Tensor): Positive boxes with shape
(N, 7).
pos_gt_bboxes (torch.Tensor): Ground truth boxes with shape
(M, 7).
ious (torch.Tensor): IoU between `pos_bboxes` and `pos_gt_bboxes`
in shape (N, M).
cfg (dict): Training configs.
Returns:
tuple[torch.Tensor]: Target for positive boxes.
(label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
bbox_weights)
"""
cls_pos_mask = ious > cfg.cls_pos_thr
cls_neg_mask = ious < cfg.cls_neg_thr
interval_mask = (cls_pos_mask == 0) & (cls_neg_mask == 0)
# iou regression target
label = (cls_pos_mask > 0).float()
label[interval_mask] = ious[interval_mask] * 2 - 0.5
# label weights
label_weights = (label >= 0).float()
# box regression target
reg_mask = pos_bboxes.new_zeros(ious.size(0)).long()
reg_mask[0:pos_gt_bboxes.size(0)] = 1
bbox_weights = (reg_mask > 0).float()
if reg_mask.bool().any():
pos_gt_bboxes_ct = pos_gt_bboxes.clone().detach()
roi_center = pos_bboxes[..., 0:3]
roi_ry = pos_bboxes[..., 6] % (2 * np.pi)
# canonical transformation
pos_gt_bboxes_ct[..., 0:3] -= roi_center
pos_gt_bboxes_ct[..., 6] -= roi_ry
pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis(
pos_gt_bboxes_ct[..., 0:3].unsqueeze(1), -roi_ry,
axis=2).squeeze(1)
# flip orientation if rois have opposite orientation
ry_label = pos_gt_bboxes_ct[..., 6] % (2 * np.pi) # 0 ~ 2pi
opposite_flag = (ry_label > np.pi * 0.5) & (ry_label < np.pi * 1.5)
ry_label[opposite_flag] = (ry_label[opposite_flag] + np.pi) % (
2 * np.pi) # (0 ~ pi/2, 3pi/2 ~ 2pi)
flag = ry_label > np.pi
ry_label[flag] = ry_label[flag] - np.pi * 2 # (-pi/2, pi/2)
ry_label = torch.clamp(ry_label, min=-np.pi / 2, max=np.pi / 2)
pos_gt_bboxes_ct[..., 6] = ry_label
rois_anchor = pos_bboxes.clone().detach()
rois_anchor[:, 0:3] = 0
rois_anchor[:, 6] = 0
bbox_targets = self.bbox_coder.encode(rois_anchor,
pos_gt_bboxes_ct)
else:
# no fg bbox
bbox_targets = pos_gt_bboxes.new_empty((0, 7))
return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
bbox_weights)
def get_corner_loss_lidar(self,
pred_bbox3d: torch.Tensor,
gt_bbox3d: torch.Tensor,
delta: float = 1.0) -> torch.Tensor:
"""Calculate corner loss of given boxes.
Args:
pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7).
gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7).
delta (float, optional): Huber loss threshold. Defaults to 1.0.
Returns:
torch.FloatTensor: Calculated corner loss in shape (N).
"""
assert pred_bbox3d.shape[0] == gt_bbox3d.shape[0]
# This is a little bit hacky because we assume the boxes are in
# LiDAR coordinates (the same assumption as in Part-A2)
gt_boxes_structure = LiDARInstance3DBoxes(gt_bbox3d)
pred_box_corners = LiDARInstance3DBoxes(pred_bbox3d).corners
gt_box_corners = gt_boxes_structure.corners
# This flip only changes the heading direction of GT boxes
gt_bbox3d_flip = gt_boxes_structure.clone()
gt_bbox3d_flip.tensor[:, 6] += np.pi
gt_box_corners_flip = gt_bbox3d_flip.corners
corner_dist = torch.min(
torch.norm(pred_box_corners - gt_box_corners, dim=2),
torch.norm(pred_box_corners - gt_box_corners_flip,
dim=2)) # (N, 8)
# huber loss
abs_error = torch.abs(corner_dist)
corner_loss = torch.where(abs_error < delta,
0.5 * abs_error**2 / delta,
abs_error - 0.5 * delta)
return corner_loss.mean(dim=1)
def get_results(self,
rois: torch.Tensor,
cls_preds: torch.Tensor,
bbox_reg: torch.Tensor,
class_labels: torch.Tensor,
input_metas: List[dict],
test_cfg: dict = None) -> InstanceList:
"""Generate bboxes from bbox head predictions.
Args:
rois (torch.Tensor): Roi bounding boxes.
cls_preds (torch.Tensor): Scores of bounding boxes.
bbox_reg (torch.Tensor): Bounding box predictions.
class_labels (torch.Tensor): Class labels.
input_metas (list[dict]): Point cloud meta info.
test_cfg (:obj:`ConfigDict`): Testing config.
Returns:
list[:obj:`InstanceData`]: Detection results of each sample
after post-processing. Each item usually contains the
following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
contains a tensor with shape (num_instances, C), where
C >= 7.
"""
roi_batch_id = rois[..., 0]
roi_boxes = rois[..., 1:] # boxes without batch id
batch_size = int(roi_batch_id.max().item() + 1)
# decode boxes
roi_ry = roi_boxes[..., 6].view(-1)
roi_xyz = roi_boxes[..., 0:3].view(-1, 3)
local_roi_boxes = roi_boxes.clone().detach()
local_roi_boxes[..., 0:3] = 0
batch_box_preds = self.bbox_coder.decode(local_roi_boxes, bbox_reg)
batch_box_preds[..., 0:3] = rotation_3d_in_axis(
batch_box_preds[..., 0:3].unsqueeze(1), roi_ry, axis=2).squeeze(1)
batch_box_preds[:, 0:3] += roi_xyz
# post processing
result_list = []
for batch_id in range(batch_size):
cur_cls_preds = cls_preds[roi_batch_id == batch_id]
box_preds = batch_box_preds[roi_batch_id == batch_id]
label_preds = class_labels[batch_id]
cur_cls_preds = cur_cls_preds.sigmoid()
cur_cls_preds, _ = torch.max(cur_cls_preds, dim=-1)
selected = self.class_agnostic_nms(
scores=cur_cls_preds,
bbox_preds=box_preds,
input_meta=input_metas[batch_id],
nms_cfg=test_cfg)
selected_bboxes = box_preds[selected]
selected_label_preds = label_preds[selected]
selected_scores = cur_cls_preds[selected]
results = InstanceData()
results.bboxes_3d = input_metas[batch_id]['box_type_3d'](
selected_bboxes, self.bbox_coder.code_size)
results.scores_3d = selected_scores
results.labels_3d = selected_label_preds
result_list.append(results)
return result_list
def class_agnostic_nms(self, scores: torch.Tensor,
bbox_preds: torch.Tensor, nms_cfg: dict,
input_meta: dict) -> Tuple[torch.Tensor]:
"""Class agnostic NMS for box head.
Args:
scores (torch.Tensor): Object score of bounding boxes.
bbox_preds (torch.Tensor): Predicted bounding boxes.
nms_cfg (dict): NMS config dict.
input_meta (dict): Contains point cloud and image meta info.
Returns:
tuple[torch.Tensor]: Bounding boxes, scores and labels.
"""
obj_scores = scores.clone()
if nms_cfg.use_rotate_nms:
nms_func = nms_bev
else:
nms_func = nms_normal_bev
bbox = input_meta['box_type_3d'](
bbox_preds.clone(),
box_dim=bbox_preds.shape[-1],
with_yaw=True,
origin=(0.5, 0.5, 0.5))
if nms_cfg.score_thr is not None:
scores_mask = (obj_scores >= nms_cfg.score_thr)
obj_scores = obj_scores[scores_mask]
bbox = bbox[scores_mask]
selected = []
if obj_scores.shape[0] > 0:
box_scores_nms, indices = torch.topk(
obj_scores, k=min(4096, obj_scores.shape[0]))
bbox_bev = bbox.bev[indices]
bbox_for_nms = xywhr2xyxyr(bbox_bev)
keep = nms_func(bbox_for_nms, box_scores_nms, nms_cfg.nms_thr)
selected = indices[keep]
if nms_cfg.score_thr is not None:
original_idxs = scores_mask.nonzero().view(-1)
selected = original_idxs[selected]
return selected
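A small numeric check of the Huber penalty used in get_corner_loss_lidar above: the loss is quadratic below delta and linear beyond it (the corner distances are illustrative).

import torch

delta = 1.0
corner_dist = torch.tensor([0.2, 0.8, 1.0, 3.0])
abs_error = torch.abs(corner_dist)
corner_loss = torch.where(abs_error < delta,
                          0.5 * abs_error**2 / delta,
                          abs_error - 0.5 * delta)
print(corner_loss)  # tensor([0.0200, 0.3200, 0.5000, 2.5000])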
# Copyright (c) OpenMMLab. All rights reserved.
from .foreground_segmentation_head import ForegroundSegmentationHead
from .pointwise_semantic_head import PointwiseSemanticHead
from .primitive_head import PrimitiveHead
__all__ = [
'PointwiseSemanticHead', 'PrimitiveHead', 'ForegroundSegmentationHead'
]
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, Optional, Tuple
import torch
from mmcv.cnn.bricks import build_norm_layer
from mmdet.models.utils import multi_apply
from mmengine.model import BaseModule
from mmengine.structures import InstanceData
from torch import nn as nn
from mmdet3d.models.builder import build_loss
from mmdet3d.registry import MODELS
from mmdet3d.utils import InstanceList
@MODELS.register_module()
class ForegroundSegmentationHead(BaseModule):
"""Foreground segmentation head.
Args:
in_channels (int): The number of input channels.
mlp_channels (tuple[int]): MLP channels. Defaults to (256, 256).
extra_width (float): Box enlargement width. Defaults to 0.1.
norm_cfg (dict): Type of normalization method. Defaults to
dict(type='BN1d', eps=1e-5, momentum=0.1).
init_cfg (dict, optional): Initialization config of the model.
Defaults to None.
loss_seg (dict): Config of segmentation loss. Defaults to
dict(type='mmdet.FocalLoss')
"""
def __init__(
self,
in_channels: int,
mlp_channels: Tuple[int] = (256, 256),
extra_width: float = 0.1,
norm_cfg: dict = dict(type='BN1d', eps=1e-5, momentum=0.1),
init_cfg: Optional[dict] = None,
loss_seg: dict = dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
reduction='sum',
gamma=2.0,
alpha=0.25,
activated=True,
loss_weight=1.0)
) -> None:
super(ForegroundSegmentationHead, self).__init__(init_cfg=init_cfg)
self.extra_width = extra_width
self.num_classes = 1
self.in_channels = in_channels
self.use_sigmoid_cls = loss_seg.get('use_sigmoid', False)
out_channels = 1
if self.use_sigmoid_cls:
self.out_channels = out_channels
else:
self.out_channels = out_channels + 1
mlps_layers = []
cin = in_channels
for mlp in mlp_channels:
mlps_layers.extend([
nn.Linear(cin, mlp, bias=False),
build_norm_layer(norm_cfg, mlp)[1],
nn.ReLU()
])
cin = mlp
mlps_layers.append(nn.Linear(cin, self.out_channels, bias=True))
self.seg_cls_layer = nn.Sequential(*mlps_layers)
self.loss_seg = build_loss(loss_seg)
def forward(self, feats: torch.Tensor) -> dict:
"""Forward head.
Args:
feats (torch.Tensor): Point-wise features.
Returns:
dict: Segment predictions.
"""
seg_preds = self.seg_cls_layer(feats)
return dict(seg_preds=seg_preds)
def _get_targets_single(self, point_xyz: torch.Tensor,
gt_bboxes_3d: InstanceData,
gt_labels_3d: torch.Tensor) -> torch.Tensor:
"""generate segmentation targets for a single sample.
Args:
point_xyz (torch.Tensor): Coordinate of points.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in
shape (box_num, 7).
gt_labels_3d (torch.Tensor): Class labels of ground truths in
shape (box_num).
Returns:
torch.Tensor: Points class labels.
"""
point_cls_labels_single = point_xyz.new_zeros(
point_xyz.shape[0]).long()
enlarged_gt_boxes = gt_bboxes_3d.enlarged_box(self.extra_width)
box_idxs_of_pts = gt_bboxes_3d.points_in_boxes_part(point_xyz).long()
extend_box_idxs_of_pts = enlarged_gt_boxes.points_in_boxes_part(
point_xyz).long()
box_fg_flag = box_idxs_of_pts >= 0
fg_flag = box_fg_flag.clone()
ignore_flag = fg_flag ^ (extend_box_idxs_of_pts >= 0)
point_cls_labels_single[ignore_flag] = -1
gt_box_of_fg_points = gt_labels_3d[box_idxs_of_pts[fg_flag]]
point_cls_labels_single[
fg_flag] = 1 if self.num_classes == 1 else\
gt_box_of_fg_points.long()
return point_cls_labels_single,
def get_targets(self, points_bxyz: torch.Tensor,
batch_gt_instances_3d: InstanceList) -> dict:
"""Generate segmentation targets.
Args:
points_bxyz (torch.Tensor): Points with batch indices, in shape
(num_points, 1 + 3).
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instances. It usually includes ``bboxes_3d`` and
``labels_3d`` attributes.
Returns:
dict: Prediction targets
- seg_targets (torch.Tensor): Segmentation targets.
"""
batch_size = len(batch_gt_instances_3d)
points_xyz_list = []
gt_bboxes_3d = []
gt_labels_3d = []
for idx in range(batch_size):
coords_idx = points_bxyz[:, 0] == idx
points_xyz_list.append(points_bxyz[coords_idx][..., 1:])
gt_bboxes_3d.append(batch_gt_instances_3d[idx].bboxes_3d)
gt_labels_3d.append(batch_gt_instances_3d[idx].labels_3d)
seg_targets, = multi_apply(self._get_targets_single, points_xyz_list,
gt_bboxes_3d, gt_labels_3d)
seg_targets = torch.cat(seg_targets, dim=0)
return dict(seg_targets=seg_targets)
def loss(self, semantic_results: dict,
semantic_targets: dict) -> Dict[str, torch.Tensor]:
"""Calculate point-wise segmentation losses.
Args:
semantic_results (dict): Results from semantic head.
semantic_targets (dict): Targets of semantic results.
Returns:
dict: Loss of segmentation.
- loss_semantic (torch.Tensor): Segmentation prediction loss.
"""
seg_preds = semantic_results['seg_preds']
seg_targets = semantic_targets['seg_targets']
positives = (seg_targets > 0)
negative_cls_weights = (seg_targets == 0).float()
seg_weights = (negative_cls_weights + 1.0 * positives).float()
pos_normalizer = positives.sum(dim=0).float()
seg_weights /= torch.clamp(pos_normalizer, min=1.0)
seg_preds = torch.sigmoid(seg_preds)
loss_seg = self.loss_seg(seg_preds, (~positives).long(), seg_weights)
return dict(loss_semantic=loss_seg)
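The per-point weighting in ForegroundSegmentationHead.loss above gives every non-ignored point a unit weight and then normalizes by the number of positives. A standalone sketch with made-up targets:

import torch

seg_targets = torch.tensor([1, 0, 0, 1, -1])   # 1 = fg, 0 = bg, -1 = ignored
positives = seg_targets > 0
negative_cls_weights = (seg_targets == 0).float()
seg_weights = (negative_cls_weights + 1.0 * positives).float()
pos_normalizer = positives.sum(dim=0).float()
seg_weights /= torch.clamp(pos_normalizer, min=1.0)
print(seg_weights)  # tensor([0.5000, 0.5000, 0.5000, 0.5000, 0.0000])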
......@@ -2,6 +2,7 @@
from typing import Dict, Optional, Tuple
import torch
from mmdet.models.utils import multi_apply
from mmengine.model import BaseModule
from torch import Tensor
from torch import nn as nn
......@@ -10,7 +11,6 @@ from torch.nn import functional as F
from mmdet3d.registry import MODELS
from mmdet3d.structures.bbox_3d import BaseInstance3DBoxes, rotation_3d_in_axis
from mmdet3d.utils import InstanceList
@MODELS.register_module()
......