Commit db44cc50 authored by ZCMax, committed by ChaimZhu

[Refactor] Refactor the model of VoxelNet and DynamicVoxelNet

parent 7fda1f66
......@@ -7,6 +7,7 @@ from .builder import (BACKBONES, DETECTORS, FUSION_LAYERS, HEADS, LOSSES,
build_loss, build_middle_encoder, build_model,
build_neck, build_roi_extractor, build_shared_head,
build_voxel_encoder)
from .data_preprocessors import * # noqa: F401,F403
from .decode_heads import * # noqa: F401,F403
from .dense_heads import * # noqa: F401,F403
from .detectors import * # noqa: F401,F403
......
# Copyright (c) OpenMMLab. All rights reserved.
from .anchor3d_head import Anchor3DHead
from .anchor_free_mono3d_head import AnchorFreeMono3DHead
from .base_3d_dense_head import Base3DDenseHead
from .base_conv_bbox_head import BaseConvBboxHead
from .base_mono3d_dense_head import BaseMono3DDenseHead
from .centerpoint_head import CenterHead
......@@ -21,5 +22,5 @@ __all__ = [
'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead',
'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead',
'MonoFlexHead'
'MonoFlexHead', 'Base3DDenseHead'
]
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from typing import List, Optional, Tuple
from typing import List, Tuple
import numpy as np
import torch
from mmcv import ConfigDict
from mmcv.runner import BaseModule, force_fp32
from mmengine.data import InstanceData
from torch import Tensor
from torch import nn as nn
from mmdet3d.core import (Det3DDataSample, PseudoSampler, box3d_multiclass_nms,
limit_period, merge_aug_bboxes_3d, xywhr2xyxyr)
from mmdet3d.core import PseudoSampler, merge_aug_bboxes_3d
from mmdet3d.core.utils import ConfigType, InstanceList, OptConfigType
from mmdet3d.core.utils.typing import OptInstanceList
from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet.core import multi_apply
from .base_3d_dense_head import Base3DDenseHead
from .train_mixins import AnchorTrainMixin
@MODELS.register_module()
class Anchor3DHead(BaseModule, AnchorTrainMixin):
"""Anchor head for SECOND/PointPillars/MVXNet/PartA2.
class Anchor3DHead(Base3DDenseHead, AnchorTrainMixin):
"""Anchor-based head for SECOND/PointPillars/MVXNet/PartA2.
Args:
num_classes (int): Number of classes.
in_channels (int): Number of channels in the input feature map.
train_cfg (dict): Train configs.
test_cfg (dict): Test configs.
feat_channels (int): Number of channels of the feature map.
use_direction_classifier (bool): Whether to add a direction classifier.
anchor_generator (dict): Config dict of anchor generator.
......@@ -42,16 +39,17 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
loss_cls (dict): Config of classification loss.
loss_bbox (dict): Config of localization loss.
loss_dir (dict): Config of direction classifier loss.
train_cfg (dict): Train configs.
test_cfg (dict): Test configs.
init_cfg (dict or list[dict], optional): Initialization config dict.
"""
def __init__(self,
num_classes: int,
in_channels: int,
train_cfg: dict,
test_cfg: dict,
feat_channels: int = 256,
use_direction_classifier: bool = True,
anchor_generator: dict = dict(
anchor_generator: ConfigType = dict(
type='Anchor3DRangeGenerator',
range=[0, -39.68, -1.78, 69.12, 39.68, -1.78],
strides=[2],
......@@ -64,16 +62,20 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
diff_rad_by_sin: bool = True,
dir_offset: float = -np.pi / 2,
dir_limit_offset: int = 0,
bbox_coder: dict = dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls: dict = dict(
type='CrossEntropyLoss',
bbox_coder: ConfigType = dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls: ConfigType = dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0),
loss_bbox: dict = dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir: dict = dict(
type='CrossEntropyLoss', loss_weight=0.2),
init_cfg: Optional[dict] = None) -> None:
loss_bbox: ConfigType = dict(
type='mmdet.SmoothL1Loss',
beta=1.0 / 9.0,
loss_weight=2.0),
loss_dir: ConfigType = dict(
type='mmdet.CrossEntropyLoss', loss_weight=0.2),
train_cfg: OptConfigType = None,
test_cfg: OptConfigType = None,
init_cfg: OptConfigType = None) -> None:
super().__init__(init_cfg=init_cfg)
self.in_channels = in_channels
self.num_classes = num_classes
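The losses above now carry an explicit `mmdet.` prefix so the cross-repo registry can resolve them from mmdet rather than mmdet3d. A minimal sketch of the resulting config style, assuming the usual `MODELS.build` wiring (the exact build calls live in the elided part of `__init__`):

# Scoped loss configs as used in the refactored head; the 'mmdet.' prefix
# tells the shared registry to look the class up in mmdet's scope.
loss_cls = dict(
    type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)
loss_bbox = dict(
    type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0)

# Presumably built inside __init__ via the shared registry, e.g.:
# self.loss_cls = MODELS.build(loss_cls)
# self.loss_bbox = MODELS.build(loss_bbox)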
......@@ -148,128 +150,53 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
self.conv_dir_cls = nn.Conv2d(self.feat_channels,
self.num_anchors * 2, 1)
def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
"""Forward function on a single-scale feature map.
Args:
x (torch.Tensor): Input features.
x (Tensor): Features of a single scale level.
Returns:
tuple[torch.Tensor]: Contain score of each class, bbox
regression and direction classification predictions.
tuple:
cls_score (Tensor): Cls scores for a single scale level,
the channels number is num_base_priors * num_classes.
bbox_pred (Tensor): Box energies / deltas for a single scale
level, the channels number is num_base_priors * C.
dir_cls_pred (Tensor | None): Direction classification
prediction for a single scale level, the channels
number is num_base_priors * 2.
"""
cls_score = self.conv_cls(x)
bbox_pred = self.conv_reg(x)
dir_cls_preds = None
dir_cls_pred = None
if self.use_direction_classifier:
dir_cls_preds = self.conv_dir_cls(x)
return cls_score, bbox_pred, dir_cls_preds
dir_cls_pred = self.conv_dir_cls(x)
return cls_score, bbox_pred, dir_cls_pred
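The output channels of the three conv branches follow directly from the anchor layout. A small arithmetic sketch, using the configuration of the unit test at the end of this commit (3 classes, 3 anchor sizes x 2 rotations, 7-dim box code):

num_classes = 3
num_anchors = 3 * 2            # anchor sizes x rotations per location
box_code_size = 7              # DeltaXYZWLHRBBoxCoder: (x, y, z, w, l, h, yaw)

cls_channels = num_anchors * num_classes    # 18 -> conv_cls / cls_score
reg_channels = num_anchors * box_code_size  # 42 -> conv_reg / bbox_pred
dir_channels = num_anchors * 2              # 12 -> conv_dir_cls / dir_cls_pred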
def forward(self, feats: List[Tensor]) -> Tuple[list]:
def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]:
"""Forward pass.
Args:
feats (list[torch.Tensor]): Multi-level features, e.g.,
features produced by FPN.
Returns:
tuple[list[torch.Tensor]]: Multi-level class score, bbox
and direction predictions.
"""
return multi_apply(self.forward_single, feats)
def forward_train(self,
feats: List[Tensor],
batch_data_samples: List[Det3DDataSample],
proposal_cfg: Optional[ConfigDict] = None,
**kwargs):
"""
Args:
feats (list[Tensor]): Features from FPN.
batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
contains the meta information of each sample and
corresponding annotations.
proposal_cfg (ConfigDict, optional): Test / postprocessing
configuration, if None, test_cfg would be used.
Defaults to None.
Returns:
tuple or Tensor: When `proposal_cfg` is None, the detector is a
normal one-stage detector, and the return value is the losses.
- losses: (dict[str, Tensor]): A dictionary of loss components.
When the `proposal_cfg` is not None, the head is used as a
`rpn_head`, the return value is a tuple contains:
- losses: (dict[str, Tensor]): A dictionary of loss components.
- results_list (list[:obj:`InstanceData`]): Detection
results of each input after the post process.
Each item usually contains the following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
contains a tensor with shape (num_instances, 7).
"""
outs = self.forward(feats)
batch_gt_instance_3d = []
batch_gt_instances_ignore = []
batch_input_metas = []
for data_sample in batch_data_samples:
batch_input_metas.append(data_sample.metainfo)
batch_gt_instance_3d.append(data_sample.gt_instances_3d)
if 'ignored_instances' in data_sample:
batch_gt_instances_ignore.append(data_sample.ignored_instances)
else:
batch_gt_instances_ignore.append(None)
loss_inputs = outs + (batch_gt_instance_3d, batch_input_metas)
losses = self.loss(
*loss_inputs, batch_gt_instances_ignore=batch_gt_instances_ignore)
if proposal_cfg is None:
return losses
else:
batch_img_metas = [
data_sample.metainfo for data_sample in batch_data_samples
]
results_list = self.get_results(
*outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg)
return losses, results_list
def simple_test(self,
feats: Tuple[Tensor],
batch_input_metas: List[dict],
rescale: bool = False) -> List[InstanceData]:
"""Test function without test-time augmentation.
Args:
feats (tuple[torch.Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
batch_input_metas (list[dict]): List of image information.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
x (tuple[Tensor]): Features from the upstream network,
each is a 4D-tensor.
Returns:
list[:obj:`InstanceData`]: Detection results of each input
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
contains a tensor with shape (num_instances, 7).
tuple: A tuple of classification scores, bbox predictions and
direction classification predictions.
- cls_scores (list[Tensor]): Classification scores for all
scale levels, each is a 4D-tensor, the channels number
is num_base_priors * num_classes.
- bbox_preds (list[Tensor]): Box energies / deltas for all
scale levels, each is a 4D-tensor, the channels number
is num_base_priors * C.
- dir_cls_preds (list[Tensor|None]): Direction classification
predictions for all scale levels, each is a 4D-tensor,
the channels number is num_base_priors * 2.
"""
outs = self.forward(feats)
results_list = self.get_results(
*outs, input_metas=batch_input_metas, rescale=rescale)
return results_list
return multi_apply(self.forward_single, x)
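`multi_apply` is the small mmdet helper that runs `forward_single` on every feature level and transposes the per-level result tuples into one list per output. A sketch of its behavior (reproduced here from `mmdet.core` for illustration):

from functools import partial

def multi_apply(func, *args, **kwargs):
    # Apply `func` to each level, then transpose:
    # [(cls0, bbox0, dir0), (cls1, bbox1, dir1)] ->
    # ([cls0, cls1], [bbox0, bbox1], [dir0, dir1])
    pfunc = partial(func, **kwargs) if kwargs else func
    map_results = map(pfunc, *args)
    return tuple(map(list, zip(*map_results)))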
# TODO: Support augmentation test
def aug_test(self,
aug_batch_feats,
aug_batch_input_metas,
......@@ -313,22 +240,24 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
anchor_list = [multi_level_anchors for _ in range(num_imgs)]
return anchor_list
def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels,
label_weights, bbox_targets, bbox_weights, dir_targets,
dir_weights, num_total_samples):
def _loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor,
dir_cls_pred: Tensor, labels: Tensor,
label_weights: Tensor, bbox_targets: Tensor,
bbox_weights: Tensor, dir_targets: Tensor,
dir_weights: Tensor, num_total_samples: int):
"""Calculate loss of Single-level results.
Args:
cls_score (torch.Tensor): Class score in single-level.
bbox_pred (torch.Tensor): Bbox prediction in single-level.
dir_cls_preds (torch.Tensor): Predictions of direction class
cls_score (Tensor): Class score in single-level.
bbox_pred (Tensor): Bbox prediction in single-level.
dir_cls_pred (Tensor): Predictions of direction class
in single-level.
labels (torch.Tensor): Labels of class.
label_weights (torch.Tensor): Weights of class loss.
bbox_targets (torch.Tensor): Targets of bbox predictions.
bbox_weights (torch.Tensor): Weights of bbox loss.
dir_targets (torch.Tensor): Targets of direction predictions.
dir_weights (torch.Tensor): Weights of direction loss.
labels (Tensor): Labels of class.
label_weights (Tensor): Weights of class loss.
bbox_targets (Tensor): Targets of bbox predictions.
bbox_weights (Tensor): Weights of bbox loss.
dir_targets (Tensor): Targets of direction predictions.
dir_weights (Tensor): Weights of direction loss.
num_total_samples (int): The number of valid samples.
Returns:
......@@ -363,10 +292,10 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
# dir loss
if self.use_direction_classifier:
dir_cls_preds = dir_cls_preds.permute(0, 2, 3, 1).reshape(-1, 2)
dir_cls_pred = dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2)
dir_targets = dir_targets.reshape(-1)
dir_weights = dir_weights.reshape(-1)
pos_dir_cls_preds = dir_cls_preds[pos_inds]
pos_dir_cls_pred = dir_cls_pred[pos_inds]
pos_dir_targets = dir_targets[pos_inds]
pos_dir_weights = dir_weights[pos_inds]
......@@ -388,14 +317,14 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
loss_dir = None
if self.use_direction_classifier:
loss_dir = self.loss_dir(
pos_dir_cls_preds,
pos_dir_cls_pred,
pos_dir_targets,
pos_dir_weights,
avg_factor=num_total_samples)
else:
loss_bbox = pos_bbox_pred.sum()
if self.use_direction_classifier:
loss_dir = pos_dir_cls_preds.sum()
loss_dir = pos_dir_cls_pred.sum()
return loss_cls, loss_bbox, loss_dir
......@@ -423,15 +352,16 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
dim=-1)
return boxes1, boxes2
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def loss(self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
dir_cls_preds: List[Tensor],
batch_gt_instances_3d: List[InstanceData],
batch_input_metas: List[dict],
batch_gt_instances_ignore: List[InstanceData] = None) -> dict:
"""Calculate losses.
def loss_by_feat(
self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
dir_cls_preds: List[Tensor],
batch_gt_instances_3d: InstanceList,
batch_input_metas: List[dict],
batch_gt_instances_ignore: OptInstanceList = None) -> dict:
"""Calculate the loss based on the features extracted by the detection
head.
Args:
cls_scores (list[torch.Tensor]): Multi-level class scores.
......@@ -481,7 +411,7 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
# num_total_samples = None
losses_cls, losses_bbox, losses_dir = multi_apply(
self.loss_single,
self._loss_by_feat_single,
cls_scores,
bbox_preds,
dir_cls_preds,
......@@ -494,165 +424,3 @@ class Anchor3DHead(BaseModule, AnchorTrainMixin):
num_total_samples=num_total_samples)
return dict(
loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dir=losses_dir)
def get_results(self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
dir_cls_preds: List[Tensor],
input_metas: List[dict],
cfg: ConfigDict = None,
rescale: list = False) -> List[InstanceData]:
"""Get results of anchor head.
Args:
cls_scores (list[torch.Tensor]): Multi-level class scores.
bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
dir_cls_preds (list[torch.Tensor]): Multi-level direction
class predictions.
input_metas (list[dict]): Contain pcd and img's meta info.
cfg (:obj:`ConfigDict`): Training or testing config.
rescale (bool): Whether to rescale bboxes.
Returns:
list[:obj:`InstanceData`]: Instance prediction
results of each sample after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instance, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
contains a tensor with shape (num_instances, 7).
"""
assert len(cls_scores) == len(bbox_preds)
assert len(cls_scores) == len(dir_cls_preds)
num_levels = len(cls_scores)
featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
device = cls_scores[0].device
mlvl_anchors = self.prior_generator.grid_anchors(
featmap_sizes, device=device)
mlvl_anchors = [
anchor.reshape(-1, self.box_code_size) for anchor in mlvl_anchors
]
result_list = []
for img_id in range(len(input_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds[i][img_id].detach() for i in range(num_levels)
]
dir_cls_pred_list = [
dir_cls_preds[i][img_id].detach() for i in range(num_levels)
]
input_meta = input_metas[img_id]
proposals = self._get_results_single(cls_score_list,
bbox_pred_list,
dir_cls_pred_list,
mlvl_anchors, input_meta, cfg,
rescale)
result_list.append(proposals)
return result_list
def _get_results_single(self,
cls_scores: Tensor,
bbox_preds: Tensor,
dir_cls_preds: Tensor,
mlvl_anchors: List[Tensor],
input_meta: List[dict],
cfg: ConfigDict = None,
rescale: bool = False) -> InstanceData:
"""Get results of single branch.
Args:
cls_scores (torch.Tensor): Class score in single batch.
bbox_preds (torch.Tensor): Bbox prediction in single batch.
dir_cls_preds (torch.Tensor): Predictions of direction class
in single batch.
mlvl_anchors (List[torch.Tensor]): Multi-level anchors
in single batch.
input_meta (list[dict]): Contain pcd and img's meta info.
cfg (:obj:`ConfigDict`): Training or testing config.
rescale (bool): Whether to rescale bboxes.
Returns:
:obj:`InstanceData`: Detection results of each sample
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instance, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
contains a tensor with shape (num_instances, 7).
"""
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
mlvl_bboxes = []
mlvl_scores = []
mlvl_dir_scores = []
for cls_score, bbox_pred, dir_cls_pred, anchors in zip(
cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:]
dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
cls_score = cls_score.permute(1, 2,
0).reshape(-1, self.num_classes)
if self.use_sigmoid_cls:
scores = cls_score.sigmoid()
else:
scores = cls_score.softmax(-1)
bbox_pred = bbox_pred.permute(1, 2,
0).reshape(-1, self.box_code_size)
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
if self.use_sigmoid_cls:
max_scores, _ = scores.max(dim=1)
else:
max_scores, _ = scores[:, :-1].max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
anchors = anchors[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
dir_cls_score = dir_cls_score[topk_inds]
bboxes = self.bbox_coder.decode(anchors, bbox_pred)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_dir_scores.append(dir_cls_score)
mlvl_bboxes = torch.cat(mlvl_bboxes)
mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
mlvl_bboxes, box_dim=self.box_code_size).bev)
mlvl_scores = torch.cat(mlvl_scores)
mlvl_dir_scores = torch.cat(mlvl_dir_scores)
if self.use_sigmoid_cls:
# Append a dummy background class at the end when using sigmoid
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
score_thr = cfg.get('score_thr', 0)
results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
mlvl_scores, score_thr, cfg.max_num,
cfg, mlvl_dir_scores)
bboxes, scores, labels, dir_scores = results
if bboxes.shape[0] > 0:
dir_rot = limit_period(bboxes[..., 6] - self.dir_offset,
self.dir_limit_offset, np.pi)
bboxes[..., 6] = (
dir_rot + self.dir_offset +
np.pi * dir_scores.to(bboxes.dtype))
bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size)
results = InstanceData()
results.bboxes_3d = bboxes
results.scores_3d = scores
results.labels_3d = labels
return results
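The final yaw decoding above wraps the regressed angle into a pi-long window with `limit_period` and then uses the 2-way direction classifier to restore the pi ambiguity that the sin-difference encoding discards. A numeric sketch, assuming mmdet3d's definition of `limit_period`:

import numpy as np
import torch

def limit_period(val, offset=0.5, period=np.pi):
    # mmdet3d.core: wrap `val` into a window of length `period`.
    return val - torch.floor(val / period + offset) * period

dir_offset, dir_limit_offset = -np.pi / 2, 0
yaw = torch.tensor([2.5])            # decoded box yaw before direction fix
dir_score = torch.tensor([1.0])      # argmax of the direction classifier

dir_rot = limit_period(yaw - dir_offset, dir_limit_offset, np.pi)
decoded_yaw = dir_rot + dir_offset + np.pi * dir_score
# decoded_yaw is ~2.5 here: the pi-periodic wrap plus the direction bit
# reconstruct the original heading.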
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta, abstractmethod
from typing import List, Optional, Tuple
import numpy as np
import torch
from mmcv.cnn.utils.weight_init import constant_init
from mmengine.config import ConfigDict
from mmengine.data import InstanceData
from mmengine.model import BaseModule
from torch import Tensor
from mmdet3d.core import box3d_multiclass_nms, limit_period, xywhr2xyxyr
from mmdet3d.core.utils import InstanceList, OptMultiConfig, SampleList
from mmdet.core.utils import select_single_mlvl
class Base3DDenseHead(BaseModule, metaclass=ABCMeta):
"""Base class for 3D DenseHeads.
1. The ``init_weights`` method is used to initialize densehead's
model parameters. After detector initialization, ``init_weights``
is triggered when ``detector.init_weights()`` is called externally.
2. The ``loss`` method is used to calculate the loss of densehead,
which includes two steps: (1) the densehead model performs forward
propagation to obtain the feature maps (2) The ``loss_by_feat`` method
is called based on the feature maps to calculate the loss.
.. code:: text
loss(): forward() -> loss_by_feat()
3. The ``predict`` method is used to predict detection results,
which includes two steps: (1) the densehead model performs forward
propagation to obtain the feature maps (2) The ``predict_by_feat`` method
is called based on the feature maps to predict detection results including
post-processing.
.. code:: text
predict(): forward() -> predict_by_feat()
4. The ``loss_and_predict`` method is used to return loss and detection
results at the same time. It will call densehead's ``forward``,
``loss_by_feat`` and ``predict_by_feat`` methods in order. If a one-stage
head is used as an RPN, the densehead needs to return both losses and
predictions; these predictions are used as the proposals of the roihead.
.. code:: text
loss_and_predict(): forward() -> loss_by_feat() -> predict_by_feat()
"""
def __init__(self, init_cfg: OptMultiConfig = None) -> None:
super().__init__(init_cfg=init_cfg)
def init_weights(self) -> None:
"""Initialize the weights."""
super().init_weights()
# avoid init_cfg overwrite the initialization of `conv_offset`
for m in self.modules():
# DeformConv2dPack, ModulatedDeformConv2dPack
if hasattr(m, 'conv_offset'):
constant_init(m.conv_offset, 0)
def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList,
**kwargs) -> dict:
"""Perform forward propagation and loss calculation of the detection
head on the features of the upstream network.
Args:
x (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
Returns:
dict: A dictionary of loss components.
"""
outs = self(x)
batch_gt_instances_3d = []
batch_gt_instances_ignore = []
batch_input_metas = []
for data_sample in batch_data_samples:
batch_input_metas.append(data_sample.metainfo)
batch_gt_instances_3d.append(data_sample.gt_instances_3d)
batch_gt_instances_ignore.append(
data_sample.get('ignored_instances', None))
loss_inputs = outs + (batch_gt_instances_3d, batch_input_metas,
batch_gt_instances_ignore)
losses = self.loss_by_feat(*loss_inputs)
return losses
@abstractmethod
def loss_by_feat(self, **kwargs) -> dict:
"""Calculate the loss based on the features extracted by the detection
head."""
pass
def loss_and_predict(self,
x: Tuple[Tensor],
batch_data_samples: SampleList,
proposal_cfg: Optional[ConfigDict] = None,
**kwargs) -> Tuple[dict, InstanceList]:
"""Perform forward propagation of the head, then calculate loss and
predictions from the features and data samples.
Args:
x (tuple[Tensor]): Features from FPN.
batch_data_samples (list[:obj:`Det3DDataSample`]): Each item contains
the meta information of each sample and corresponding
annotations.
proposal_cfg (ConfigDict, optional): Test / postprocessing
configuration, if None, test_cfg would be used.
Defaults to None.
Returns:
tuple: the return value is a tuple contains:
- losses: (dict[str, Tensor]): A dictionary of loss components.
- predictions (list[:obj:`InstanceData`]): Detection
results of each image after the post process.
"""
batch_gt_instances = []
batch_gt_instances_ignore = []
batch_input_metas = []
for data_sample in batch_data_samples:
batch_input_metas.append(data_sample.metainfo)
batch_gt_instances.append(data_sample.gt_instances_3d)
batch_gt_instances_ignore.append(
data_sample.get('ignored_instances', None))
outs = self(x)
loss_inputs = outs + (batch_gt_instances, batch_input_metas,
batch_gt_instances_ignore)
losses = self.loss_by_feat(*loss_inputs)
predictions = self.predict_by_feat(
*outs, batch_input_metas=batch_input_metas, cfg=proposal_cfg)
return losses, predictions
def predict(self,
x: Tuple[Tensor],
batch_data_samples: SampleList,
rescale: bool = False) -> InstanceList:
"""Perform forward propagation of the 3D detection head and predict
detection results on the features of the upstream network.
Args:
x (tuple[Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`, `gt_pts_panoptic_seg` and
`gt_pts_sem_seg`.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
Returns:
list[:obj:`InstanceData`]: Detection results of each sample
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
contains a tensor with shape (num_instances, C), where
C >= 7.
"""
batch_input_metas = [
data_samples.metainfo for data_samples in batch_data_samples
]
outs = self(x)
predictions = self.predict_by_feat(
*outs, batch_input_metas=batch_input_metas, rescale=rescale)
return predictions
def predict_by_feat(self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
dir_cls_preds: List[Tensor],
batch_input_metas: Optional[List[dict]] = None,
cfg: Optional[ConfigDict] = None,
rescale: bool = False,
**kwargs) -> InstanceList:
"""Transform a batch of output features extracted from the head into
bbox results.
Args:
cls_scores (list[Tensor]): Classification scores for all
scale levels, each is a 4D-tensor, has shape
(batch_size, num_priors * num_classes, H, W).
bbox_preds (list[Tensor]): Box energies / deltas for all
scale levels, each is a 4D-tensor, has shape
(batch_size, num_priors * 4, H, W).
dir_cls_preds (list[Tensor]): Direction classification predictions
for all scale levels, each is a 4D-tensor, has shape
(batch_size, num_priors * 2, H, W).
batch_input_metas (list[dict], Optional): Batch image meta info.
Defaults to None.
cfg (ConfigDict, optional): Test / postprocessing
configuration, if None, test_cfg would be used.
Defaults to None.
rescale (bool): If True, return boxes in original image space.
Defaults to False.
Returns:
list[:obj:`InstanceData`]: Detection results of each sample
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
contains a tensor with shape (num_instances, C), where
C >= 7.
"""
assert len(cls_scores) == len(bbox_preds)
assert len(cls_scores) == len(dir_cls_preds)
num_levels = len(cls_scores)
featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
mlvl_priors = self.prior_generator.grid_anchors(
featmap_sizes, device=cls_scores[0].device)
mlvl_priors = [
prior.reshape(-1, self.box_code_size) for prior in mlvl_priors
]
result_list = []
for input_id in range(len(batch_input_metas)):
input_meta = batch_input_metas[input_id]
cls_score_list = select_single_mlvl(cls_scores, input_id)
bbox_pred_list = select_single_mlvl(bbox_preds, input_id)
dir_cls_pred_list = select_single_mlvl(dir_cls_preds, input_id)
results = self._predict_by_feat_single(
cls_score_list=cls_score_list,
bbox_pred_list=bbox_pred_list,
dir_cls_pred_list=dir_cls_pred_list,
mlvl_priors=mlvl_priors,
input_meta=input_meta,
cfg=cfg,
rescale=rescale,
**kwargs)
result_list.append(results)
return result_list
def _predict_by_feat_single(self,
cls_score_list: List[Tensor],
bbox_pred_list: List[Tensor],
dir_cls_pred_list: List[Tensor],
mlvl_priors: List[Tensor],
input_meta: dict,
cfg: ConfigDict,
rescale: bool = False,
**kwargs) -> InstanceData:
"""Transform a single image's features extracted from the head into
bbox results.
Args:
cls_score_list (list[Tensor]): Box scores from all scale
levels of a single point cloud sample, each item has shape
(num_priors * num_classes, H, W).
bbox_pred_list (list[Tensor]): Box energies / deltas from
all scale levels of a single point cloud sample, each item
has shape (num_priors * C, H, W).
dir_cls_pred_list (list[Tensor]): Predictions of direction class
from all scale levels of a single point cloud sample, each
item has shape (num_priors * 2, H, W).
mlvl_priors (list[Tensor]): Each element in the list is the priors
of a single level in feature pyramid. For the anchor-based 3D
heads here, each has shape (num_priors, box_code_size).
input_meta (dict): Contain point clouds and image meta info.
cfg (:obj:`ConfigDict`): Test / postprocessing configuration,
if None, test_cfg would be used.
rescale (bool): If True, return boxes in original image space.
Defaults to False.
Returns:
:obj:`InstanceData`: Detection results of each image
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
contains a tensor with shape (num_instances, C), where C >= 7.
"""
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_priors)
mlvl_bboxes = []
mlvl_scores = []
mlvl_dir_scores = []
for cls_score, bbox_pred, dir_cls_pred, priors in zip(
cls_score_list, bbox_pred_list, dir_cls_pred_list,
mlvl_priors):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:]
dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
cls_score = cls_score.permute(1, 2,
0).reshape(-1, self.num_classes)
if self.use_sigmoid_cls:
scores = cls_score.sigmoid()
else:
scores = cls_score.softmax(-1)
bbox_pred = bbox_pred.permute(1, 2,
0).reshape(-1, self.box_code_size)
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
if self.use_sigmoid_cls:
max_scores, _ = scores.max(dim=1)
else:
max_scores, _ = scores[:, :-1].max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
priors = priors[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
dir_cls_score = dir_cls_score[topk_inds]
bboxes = self.bbox_coder.decode(priors, bbox_pred)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_dir_scores.append(dir_cls_score)
mlvl_bboxes = torch.cat(mlvl_bboxes)
mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
mlvl_bboxes, box_dim=self.box_code_size).bev)
mlvl_scores = torch.cat(mlvl_scores)
mlvl_dir_scores = torch.cat(mlvl_dir_scores)
if self.use_sigmoid_cls:
# Append a dummy background class at the end when using sigmoid
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
score_thr = cfg.get('score_thr', 0)
results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
mlvl_scores, score_thr, cfg.max_num,
cfg, mlvl_dir_scores)
bboxes, scores, labels, dir_scores = results
if bboxes.shape[0] > 0:
dir_rot = limit_period(bboxes[..., 6] - self.dir_offset,
self.dir_limit_offset, np.pi)
bboxes[..., 6] = (
dir_rot + self.dir_offset +
np.pi * dir_scores.to(bboxes.dtype))
bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size)
results = InstanceData()
results.bboxes_3d = bboxes
results.scores_3d = scores
results.labels_3d = labels
return results
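Why the zero padding above: sigmoid heads predict no explicit background column, but `box3d_multiclass_nms` expects scores of shape (N, C + 1) with the last column reserved for background. A tiny sketch:

import torch

mlvl_scores = torch.rand(4, 3)  # sigmoid scores: (num_boxes, num_classes)
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
# Now (num_boxes, num_classes + 1); the NMS treats the appended last
# column as background and only iterates over the foreground classes.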
# TODO: Support augmentation test
def aug_test(self,
aug_batch_feats,
aug_batch_input_metas,
rescale=False,
with_ori_nms=False,
**kwargs):
pass
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Tuple
import torch
from mmcv.runner import force_fp32
from torch import Tensor
from torch.nn import functional as F
from mmdet3d.core.utils import ConfigType, OptConfigType, OptMultiConfig
from mmdet3d.registry import MODELS
from .voxelnet import VoxelNet
......@@ -13,17 +17,17 @@ class DynamicVoxelNet(VoxelNet):
"""
def __init__(self,
voxel_layer,
voxel_encoder,
middle_encoder,
backbone,
neck=None,
bbox_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
init_cfg=None):
super(DynamicVoxelNet, self).__init__(
voxel_layer: ConfigType,
voxel_encoder: ConfigType,
middle_encoder: ConfigType,
backbone: ConfigType,
neck: OptConfigType = None,
bbox_head: OptConfigType = None,
train_cfg: OptConfigType = None,
test_cfg: OptConfigType = None,
data_preprocessor: OptConfigType = None,
init_cfg: OptMultiConfig = None) -> None:
super().__init__(
voxel_layer=voxel_layer,
voxel_encoder=voxel_encoder,
middle_encoder=middle_encoder,
......@@ -32,30 +36,19 @@ class DynamicVoxelNet(VoxelNet):
bbox_head=bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained,
data_preprocessor=data_preprocessor,
init_cfg=init_cfg)
def extract_feat(self, points, img_metas):
"""Extract features from points."""
voxels, coors = self.voxelize(points)
voxel_features, feature_coors = self.voxel_encoder(voxels, coors)
batch_size = coors[-1, 0].item() + 1
x = self.middle_encoder(voxel_features, feature_coors, batch_size)
x = self.backbone(x)
if self.with_neck:
x = self.neck(x)
return x
@torch.no_grad()
@force_fp32()
def voxelize(self, points):
def voxelize(self, points: List[torch.Tensor]) -> tuple:
"""Apply dynamic voxelization to points.
Args:
points (list[torch.Tensor]): Points of each sample.
points (list[Tensor]): Points of each sample.
Returns:
tuple[torch.Tensor]: Concatenated points and coordinates.
tuple[Tensor]: Concatenated points and coordinates.
"""
coors = []
# dynamic voxelization only provides a coors mapping
......@@ -69,3 +62,16 @@ class DynamicVoxelNet(VoxelNet):
coors_batch.append(coor_pad)
coors_batch = torch.cat(coors_batch, dim=0)
return points, coors_batch
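Dynamic voxelization yields only per-point voxel coordinates; the loop above (partly elided by the diff) prepends each sample's batch index before concatenating. A sketch of that `F.pad` idiom with illustrative values:

import torch
from torch.nn import functional as F

# Per-sample voxel coordinates (z, y, x) from the dynamic voxel layer.
coors = [torch.tensor([[0, 5, 7], [1, 5, 8]]),
         torch.tensor([[0, 2, 3]])]

coors_batch = torch.cat(
    [F.pad(coor, (1, 0), mode='constant', value=i)  # prepend batch index i
     for i, coor in enumerate(coors)], dim=0)
# tensor([[0, 0, 5, 7],
#         [0, 1, 5, 8],
#         [1, 0, 2, 3]])
# `coors_batch[-1, 0] + 1` later recovers the batch size.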
def extract_feat(self, batch_inputs_dict: dict) -> Tuple[Tensor]:
"""Extract features from points."""
# TODO: Remove voxelization to datapreprocessor
points = batch_inputs_dict['points']
voxels, coors = self.voxelize(points)
voxel_features, feature_coors = self.voxel_encoder(voxels, coors)
batch_size = coors[-1, 0].item() + 1
x = self.middle_encoder(voxel_features, feature_coors, batch_size)
x = self.backbone(x)
if self.with_neck:
x = self.neck(x)
return x
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Tuple, Union
from typing import Dict, List, Tuple, Union
import torch
from torch import Tensor
from mmdet3d.core.utils import (ConfigType, OptConfigType, OptMultiConfig,
OptSampleList, SampleList)
......@@ -134,12 +135,19 @@ class SingleStage3DDetector(Base3DDetector):
results = self.bbox_head.forward(x)
return results
def extract_feat(self,
batch_inputs_dict: torch.Tensor) -> Tuple[torch.Tensor]:
def extract_feat(
self, batch_inputs_dict: torch.Tensor
) -> Union[Tuple[torch.Tensor], Dict[str, Tensor]]:
"""Directly extract features from the backbone+neck.
Args:
batch_inputs_dict (dict): The model input dict, which should
contain the ``points`` key.
Returns:
tuple[Tensor] | dict: For outdoor 3D object detection, we
typically obtain a tuple of features from the backbone + neck,
and for indoor 3D object detection, usually a dict containing
features will be obtained.
"""
points = batch_inputs_dict['points']
stack_points = torch.stack(points)
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional
from typing import List, Tuple
import torch
from mmcv.ops import Voxelization
from mmcv.runner import force_fp32
from torch import Tensor
from torch.nn import functional as F
from mmdet3d.core import Det3DDataSample
from mmdet3d.core.utils import ConfigType, OptConfigType, OptMultiConfig
from mmdet3d.registry import MODELS
from .single_stage import SingleStage3DDetector
......@@ -16,39 +17,28 @@ class VoxelNet(SingleStage3DDetector):
r"""`VoxelNet <https://arxiv.org/abs/1711.06396>`_ for 3D detection."""
def __init__(self,
voxel_layer: dict,
voxel_encoder: dict,
middle_encoder: dict,
backbone: dict,
neck: Optional[dict] = None,
bbox_head: Optional[dict] = None,
train_cfg: Optional[dict] = None,
test_cfg: Optional[dict] = None,
init_cfg: Optional[dict] = None,
pretrained: Optional[str] = None) -> None:
super(VoxelNet, self).__init__(
voxel_layer: ConfigType,
voxel_encoder: ConfigType,
middle_encoder: ConfigType,
backbone: ConfigType,
neck: OptConfigType = None,
bbox_head: OptConfigType = None,
train_cfg: OptConfigType = None,
test_cfg: OptConfigType = None,
data_preprocessor: OptConfigType = None,
init_cfg: OptMultiConfig = None) -> None:
super().__init__(
backbone=backbone,
neck=neck,
bbox_head=bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
init_cfg=init_cfg,
pretrained=pretrained)
data_preprocessor=data_preprocessor,
init_cfg=init_cfg)
self.voxel_layer = Voxelization(**voxel_layer)
self.voxel_encoder = MODELS.build(voxel_encoder)
self.middle_encoder = MODELS.build(middle_encoder)
def extract_feat(self, points: List[torch.Tensor]) -> list:
"""Extract features from points."""
voxels, num_points, coors = self.voxelize(points)
voxel_features = self.voxel_encoder(voxels, num_points, coors)
batch_size = coors[-1, 0].item() + 1
x = self.middle_encoder(voxel_features, coors, batch_size)
x = self.backbone(x)
if self.with_neck:
x = self.neck(x)
return x
@torch.no_grad()
@force_fp32()
def voxelize(self, points: List[torch.Tensor]) -> tuple:
......@@ -68,75 +58,15 @@ class VoxelNet(SingleStage3DDetector):
coors_batch = torch.cat(coors_batch, dim=0)
return voxels, num_points, coors_batch
def forward_train(self, batch_inputs_dict: Dict[list, torch.Tensor],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> dict:
"""
Args:
batch_inputs_dict (dict): The model input dict. It should contain
``points`` and ``img`` keys.
- points (list[torch.Tensor]): Point cloud of each sample.
- imgs (torch.Tensor, optional): Image of each sample.
batch_data_samples (list[:obj:`Det3DDataSample`]): The batch
data samples. It usually includes information such
as `gt_instance_3d` or `gt_panoptic_seg_3d` or `gt_sem_seg_3d`.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
x = self.extract_feat(batch_inputs_dict['points'])
losses = self.bbox_head.forward_train(x, batch_data_samples, **kwargs)
return losses
def simple_test(self,
batch_inputs_dict: Dict[list, torch.Tensor],
batch_input_metas: List[dict],
rescale: bool = False) -> list:
"""Test function without test-time augmentation.
Args:
batch_inputs_dict (dict): The model input dict. It should contain
``points`` and ``img`` keys.
- points (list[torch.Tensor]): Point cloud of single
sample.
- imgs (torch.Tensor, optional): Image of single sample.
batch_input_metas (list[dict]): List of input information.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
Returns:
list[:obj:`Det3DDataSample`]: Detection results of the \
inputs. Each Det3DDataSample usually contain \
'pred_instances_3d'. And the ``pred_instances_3d`` usually \
contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
contains a tensor with shape (num_instances, 7).
"""
x = self.extract_feat(batch_inputs_dict['points'])
bboxes_list = self.bbox_head.simple_test(
x, batch_input_metas, rescale=rescale)
# convert to Det3DDataSample
results_list = self.postprocess_result(bboxes_list)
return results_list
def aug_test(self,
aug_batch_inputs_dict: Dict[list, torch.Tensor],
aug_batch_input_metas: List[dict],
rescale: bool = False) -> list:
"""Test function with augmentaiton."""
# TODO Refactor this after mmdet update
feats = self.extract_feats(aug_batch_inputs_dict)
aug_bboxes = self.bbox_head.aug_test(
feats, aug_batch_input_metas, rescale=rescale)
return aug_bboxes
def extract_feat(self, batch_inputs_dict: dict) -> Tuple[Tensor]:
"""Extract features from points."""
# TODO: Remove voxelization to datapreprocessor
points = batch_inputs_dict['points']
voxels, num_points, coors = self.voxelize(points)
voxel_features = self.voxel_encoder(voxels, num_points, coors)
batch_size = coors[-1, 0].item() + 1
x = self.middle_encoder(voxel_features, coors, batch_size)
x = self.backbone(x)
if self.with_neck:
x = self.neck(x)
return x
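For contrast with the dynamic path, VoxelNet's hard `Voxelization` layer caps the points per voxel and returns dense tensors that `extract_feat` consumes directly. A shape sketch under the usual mmcv conventions (M non-empty voxels, T points per voxel, `model` a built VoxelNet):

voxels, num_points, coors = model.voxelize(points)
# voxels:     (M, T, C)  zero-padded points per voxel
# num_points: (M,)       valid point count per voxel
# coors:      (M, 4)     (batch_idx, z, y, x) after batch padding

voxel_features = model.voxel_encoder(voxels, num_points, coors)
batch_size = coors[-1, 0].item() + 1  # batch indices appended in order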
# Copyright (c) OpenMMLab. All rights reserved.
from unittest import TestCase
import torch
from mmengine import Config
from mmengine.data import InstanceData
from mmdet3d import * # noqa
from mmdet3d.core.bbox import Box3DMode, LiDARInstance3DBoxes
from mmdet3d.models.dense_heads import Anchor3DHead
class TestAnchor3DHead(TestCase):
def test_anchor3d_head_loss(self):
"""Test anchor head loss when truth is empty and non-empty."""
cfg = Config(
dict(
assigner=[
dict( # for Pedestrian
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.35,
neg_iou_thr=0.2,
min_pos_iou=0.2,
ignore_iof_thr=-1),
dict( # for Cyclist
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.35,
neg_iou_thr=0.2,
min_pos_iou=0.2,
ignore_iof_thr=-1),
dict( # for Car
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
],
allowed_border=0,
pos_weight=-1,
debug=False))
anchor3d_head = Anchor3DHead(
num_classes=3,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78],
],
sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=0.2),
train_cfg=cfg)
# Anchor head expects multiple levels of features per sample
feats = (torch.rand([1, 512, 200, 176], dtype=torch.float32), )
(cls_scores, bbox_preds, dir_cls_preds) = anchor3d_head.forward(feats)
self.assertEqual(cls_scores[0].shape, torch.Size([1, 18, 200, 176]))
self.assertEqual(bbox_preds[0].shape, torch.Size([1, 42, 200, 176]))
self.assertEqual(dir_cls_preds[0].shape, torch.Size([1, 12, 200, 176]))
# Test that empty ground truth encourages the network to
# predict background
gt_instances = InstanceData()
gt_bboxes_3d = LiDARInstance3DBoxes(torch.empty((0, 7)))
gt_labels_3d = torch.tensor([])
# fake input_metas
input_metas = dict(sample_idx=1234)
gt_instances.bboxes_3d = gt_bboxes_3d
gt_instances.labels_3d = gt_labels_3d
empty_gt_losses = anchor3d_head.loss_by_feat(cls_scores, bbox_preds,
dir_cls_preds,
[gt_instances],
[input_metas])
# When there is no truth, the cls loss should be nonzero but
# there should be no box and dir loss.
self.assertGreater(empty_gt_losses['loss_cls'][0], 0,
'cls loss should be non-zero')
self.assertEqual(
empty_gt_losses['loss_bbox'][0], 0,
'there should be no box loss when there are no true boxes')
self.assertEqual(
empty_gt_losses['loss_dir'][0], 0,
'there should be no dir loss when there are no true dirs')
# When truth is non-empty then both cls and box loss
# should be nonzero for random inputs
gt_instances = InstanceData()
gt_bboxes_3d = LiDARInstance3DBoxes(
torch.tensor(
[[6.4118, -3.4305, -1.7291, 1.7033, 3.4693, 1.6197, -0.9091]],
dtype=torch.float32))
gt_labels_3d = torch.tensor([1], dtype=torch.int64)
gt_instances.bboxes_3d = gt_bboxes_3d
gt_instances.labels_3d = gt_labels_3d
gt_losses = anchor3d_head.loss_by_feat(cls_scores, bbox_preds,
dir_cls_preds, [gt_instances],
[input_metas])
self.assertGreater(gt_losses['loss_cls'][0], 0,
'cls loss should be non-zero')
self.assertGreater(gt_losses['loss_bbox'][0], 0,
'box loss should be non-zero')
self.assertGreater(gt_losses['loss_dir'][0], 0,
'dir loss should be non-zero')
def test_anchor3d_head_predict(self):
cfg = Config(
dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.1,
min_bbox_size=0,
nms_pre=100,
max_num=50))
anchor3d_head = Anchor3DHead(
num_classes=3,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78],
],
sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=0.2),
test_cfg=cfg)
feats = (torch.rand([2, 512, 200, 176], dtype=torch.float32), )
(cls_scores, bbox_preds, dir_cls_preds) = anchor3d_head.forward(feats)
# fake input_metas
input_metas = [{
'sample_idx': 1234,
'box_type_3d': LiDARInstance3DBoxes,
'box_mode_3d': Box3DMode.LIDAR
}, {
'sample_idx': 2345,
'box_type_3d': LiDARInstance3DBoxes,
'box_mode_3d': Box3DMode.LIDAR
}]
# test predict_by_feat
cls_scores[0] -= 1.5 # too many positive samples may cause cuda oom
results = anchor3d_head.predict_by_feat(cls_scores, bbox_preds,
dir_cls_preds, input_metas)
pred_instances = results[0]
scores_3d = pred_instances.scores_3d
assert (scores_3d > 0.3).all()