Commit db44cc50 authored by ZCMax, committed by ChaimZhu

[Refactor] Refactor the model of VoxelNet and DynamicVoxelNet

parent 7fda1f66
...
@@ -7,6 +7,7 @@ from .builder import (BACKBONES, DETECTORS, FUSION_LAYERS, HEADS, LOSSES,
                       build_loss, build_middle_encoder, build_model,
                       build_neck, build_roi_extractor, build_shared_head,
                       build_voxel_encoder)
+from .data_preprocessors import *  # noqa: F401,F403
 from .decode_heads import *  # noqa: F401,F403
 from .dense_heads import *  # noqa: F401,F403
 from .detectors import *  # noqa: F401,F403
...
 # Copyright (c) OpenMMLab. All rights reserved.
 from .anchor3d_head import Anchor3DHead
 from .anchor_free_mono3d_head import AnchorFreeMono3DHead
+from .base_3d_dense_head import Base3DDenseHead
 from .base_conv_bbox_head import BaseConvBboxHead
 from .base_mono3d_dense_head import BaseMono3DDenseHead
 from .centerpoint_head import CenterHead
...
@@ -21,5 +22,5 @@ __all__ = [
     'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
     'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead',
     'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead',
-    'MonoFlexHead'
+    'MonoFlexHead', 'Base3DDenseHead'
 ]
This diff is collapsed.
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta, abstractmethod
from typing import List, Optional, Tuple
import numpy as np
import torch
from mmcv.cnn.utils.weight_init import constant_init
from mmengine.config import ConfigDict
from mmengine.data import InstanceData
from mmengine.model import BaseModule
from torch import Tensor
from mmdet3d.core import box3d_multiclass_nms, limit_period, xywhr2xyxyr
from mmdet3d.core.utils import InstanceList, OptMultiConfig, SampleList
from mmdet.core.utils import select_single_mlvl
class Base3DDenseHead(BaseModule, metaclass=ABCMeta):
"""Base class for 3D DenseHeads.
1. The ``init_weights`` method is used to initialize densehead's
model parameters. After detector initialization, ``init_weights``
is triggered when ``detector.init_weights()`` is called externally.
2. The ``loss`` method is used to calculate the loss of densehead,
which includes two steps: (1) the densehead model performs forward
propagation to obtain the feature maps (2) The ``loss_by_feat`` method
is called based on the feature maps to calculate the loss.
.. code:: text
loss(): forward() -> loss_by_feat()
3. The ``predict`` method is used to predict detection results,
which includes two steps: (1) the densehead model performs forward
propagation to obtain the feature maps (2) The ``predict_by_feat`` method
is called based on the feature maps to predict detection results including
post-processing.
.. code:: text
predict(): forward() -> predict_by_feat()
4. The ``loss_and_predict`` method is used to return loss and detection
results at the same time. It will call densehead's ``forward``,
       ``loss_by_feat`` and ``predict_by_feat`` methods in order. If a
       one-stage detector is used as an RPN, the densehead needs to return
       both losses and predictions. These predictions are used as the
       proposals of the roi_head.
.. code:: text
loss_and_predict(): forward() -> loss_by_feat() -> predict_by_feat()
"""
def __init__(self, init_cfg: OptMultiConfig = None) -> None:
super().__init__(init_cfg=init_cfg)
def init_weights(self) -> None:
"""Initialize the weights."""
super().init_weights()
        # avoid init_cfg overwriting the initialization of `conv_offset`
for m in self.modules():
# DeformConv2dPack, ModulatedDeformConv2dPack
if hasattr(m, 'conv_offset'):
constant_init(m.conv_offset, 0)
def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList,
**kwargs) -> dict:
"""Perform forward propagation and loss calculation of the detection
head on the features of the upstream network.
Args:
x (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
Returns:
dict: A dictionary of loss components.
"""
outs = self(x)
batch_gt_instances_3d = []
batch_gt_instances_ignore = []
batch_input_metas = []
for data_sample in batch_data_samples:
batch_input_metas.append(data_sample.metainfo)
batch_gt_instances_3d.append(data_sample.gt_instances_3d)
batch_gt_instances_ignore.append(
data_sample.get('ignored_instances', None))
loss_inputs = outs + (batch_gt_instances_3d, batch_input_metas,
batch_gt_instances_ignore)
losses = self.loss_by_feat(*loss_inputs)
return losses
@abstractmethod
def loss_by_feat(self, **kwargs) -> dict:
"""Calculate the loss based on the features extracted by the detection
head."""
pass
def loss_and_predict(self,
x: Tuple[Tensor],
batch_data_samples: SampleList,
proposal_cfg: Optional[ConfigDict] = None,
**kwargs) -> Tuple[dict, InstanceList]:
"""Perform forward propagation of the head, then calculate loss and
predictions from the features and data samples.
Args:
x (tuple[Tensor]): Features from FPN.
            batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
                contains the meta information of each sample and
                corresponding annotations.
proposal_cfg (ConfigDict, optional): Test / postprocessing
configuration, if None, test_cfg would be used.
Defaults to None.
Returns:
            tuple: The return value is a tuple that contains:
- losses: (dict[str, Tensor]): A dictionary of loss components.
- predictions (list[:obj:`InstanceData`]): Detection
results of each image after the post process.
"""
batch_gt_instances = []
batch_gt_instances_ignore = []
batch_input_metas = []
for data_sample in batch_data_samples:
batch_input_metas.append(data_sample.metainfo)
batch_gt_instances.append(data_sample.gt_instances_3d)
batch_gt_instances_ignore.append(
data_sample.get('ignored_instances', None))
outs = self(x)
loss_inputs = outs + (batch_gt_instances, batch_input_metas,
batch_gt_instances_ignore)
losses = self.loss_by_feat(*loss_inputs)
predictions = self.predict_by_feat(
*outs, batch_input_metas=batch_input_metas, cfg=proposal_cfg)
return losses, predictions
def predict(self,
x: Tuple[Tensor],
batch_data_samples: SampleList,
rescale: bool = False) -> InstanceList:
"""Perform forward propagation of the 3D detection head and predict
detection results on the features of the upstream network.
Args:
x (tuple[Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`, `gt_pts_panoptic_seg` and
`gt_pts_sem_seg`.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
Returns:
list[:obj:`InstanceData`]: Detection results of each sample
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
contains a tensor with shape (num_instances, C), where
C >= 7.
"""
batch_input_metas = [
data_samples.metainfo for data_samples in batch_data_samples
]
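        # ``metainfo`` holds per-sample information such as ``box_type_3d``,
        # which ``predict_by_feat`` needs to rebuild 3D boxes during
        # post-processing.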
outs = self(x)
predictions = self.predict_by_feat(
*outs, batch_input_metas=batch_input_metas, rescale=rescale)
return predictions
def predict_by_feat(self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
dir_cls_preds: List[Tensor],
batch_input_metas: Optional[List[dict]] = None,
cfg: Optional[ConfigDict] = None,
rescale: bool = False,
**kwargs) -> InstanceList:
"""Transform a batch of output features extracted from the head into
bbox results.
Args:
cls_scores (list[Tensor]): Classification scores for all
scale levels, each is a 4D-tensor, has shape
(batch_size, num_priors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for all
                scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * C, H, W).
            dir_cls_preds (list[Tensor]): Direction class predictions for
                all scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * 2, H, W).
batch_input_metas (list[dict], Optional): Batch image meta info.
Defaults to None.
cfg (ConfigDict, optional): Test / postprocessing
configuration, if None, test_cfg would be used.
Defaults to None.
rescale (bool): If True, return boxes in original image space.
Defaults to False.
Returns:
list[:obj:`InstanceData`]: Detection results of each sample
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
contains a tensor with shape (num_instances, C), where
C >= 7.
"""
assert len(cls_scores) == len(bbox_preds)
assert len(cls_scores) == len(dir_cls_preds)
num_levels = len(cls_scores)
featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
mlvl_priors = self.prior_generator.grid_anchors(
featmap_sizes, device=cls_scores[0].device)
mlvl_priors = [
prior.reshape(-1, self.box_code_size) for prior in mlvl_priors
]
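        # ``mlvl_priors`` now holds the flattened anchors of every feature
        # level, one row per anchor encoded with ``box_code_size`` values.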
result_list = []
for input_id in range(len(batch_input_metas)):
input_meta = batch_input_metas[input_id]
cls_score_list = select_single_mlvl(cls_scores, input_id)
bbox_pred_list = select_single_mlvl(bbox_preds, input_id)
dir_cls_pred_list = select_single_mlvl(dir_cls_preds, input_id)
results = self._predict_by_feat_single(
cls_score_list=cls_score_list,
bbox_pred_list=bbox_pred_list,
dir_cls_pred_list=dir_cls_pred_list,
mlvl_priors=mlvl_priors,
input_meta=input_meta,
cfg=cfg,
rescale=rescale,
**kwargs)
result_list.append(results)
return result_list
def _predict_by_feat_single(self,
cls_score_list: List[Tensor],
bbox_pred_list: List[Tensor],
dir_cls_pred_list: List[Tensor],
mlvl_priors: List[Tensor],
input_meta: dict,
cfg: ConfigDict,
rescale: bool = False,
**kwargs) -> InstanceData:
"""Transform a single image's features extracted from the head into
bbox results.
Args:
cls_score_list (list[Tensor]): Box scores from all scale
levels of a single point cloud sample, each item has shape
(num_priors * num_classes, H, W).
bbox_pred_list (list[Tensor]): Box energies / deltas from
all scale levels of a single point cloud sample, each item
has shape (num_priors * C, H, W).
dir_cls_pred_list (list[Tensor]): Predictions of direction class
from all scale levels of a single point cloud sample, each
item has shape (num_priors * 2, H, W).
            mlvl_priors (list[Tensor]): Each element in the list is
                the priors of a single level in the feature pyramid,
                with shape (num_priors, box_code_size), matching the
                flattened anchors generated for that level.
input_meta (dict): Contain point clouds and image meta info.
cfg (:obj:`ConfigDict`): Test / postprocessing configuration,
if None, test_cfg would be used.
rescale (bool): If True, return boxes in original image space.
Defaults to False.
Returns:
:obj:`InstanceData`: Detection results of each image
after the post process.
Each item usually contains following keys.
            - scores_3d (Tensor): Classification scores, has a shape
                (num_instances, )
            - labels_3d (Tensor): Labels of bboxes, has a shape
                (num_instances, ).
            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
                contains a tensor with shape (num_instances, C), where
                C >= 7.
"""
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_priors)
mlvl_bboxes = []
mlvl_scores = []
mlvl_dir_scores = []
for cls_score, bbox_pred, dir_cls_pred, priors in zip(
cls_score_list, bbox_pred_list, dir_cls_pred_list,
mlvl_priors):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:]
dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
cls_score = cls_score.permute(1, 2,
0).reshape(-1, self.num_classes)
if self.use_sigmoid_cls:
scores = cls_score.sigmoid()
else:
scores = cls_score.softmax(-1)
bbox_pred = bbox_pred.permute(1, 2,
0).reshape(-1, self.box_code_size)
nms_pre = cfg.get('nms_pre', -1)
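            # Keep only the top ``nms_pre`` scoring candidates of this level
            # before decoding and NMS to bound the post-processing cost.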
if nms_pre > 0 and scores.shape[0] > nms_pre:
if self.use_sigmoid_cls:
max_scores, _ = scores.max(dim=1)
else:
max_scores, _ = scores[:, :-1].max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
priors = priors[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
dir_cls_score = dir_cls_score[topk_inds]
bboxes = self.bbox_coder.decode(priors, bbox_pred)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_dir_scores.append(dir_cls_score)
mlvl_bboxes = torch.cat(mlvl_bboxes)
mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
mlvl_bboxes, box_dim=self.box_code_size).bev)
mlvl_scores = torch.cat(mlvl_scores)
mlvl_dir_scores = torch.cat(mlvl_dir_scores)
if self.use_sigmoid_cls:
            # Append a dummy background score column when using sigmoid,
            # since box3d_multiclass_nms expects the background class in
            # the last column.
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
score_thr = cfg.get('score_thr', 0)
results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
mlvl_scores, score_thr, cfg.max_num,
cfg, mlvl_dir_scores)
bboxes, scores, labels, dir_scores = results
if bboxes.shape[0] > 0:
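            # Fold the direction classification back into the yaw angle:
            # wrap the regressed angle into a half period relative to
            # ``dir_offset``, then add pi for boxes whose direction
            # classifier predicted the opposite heading.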
dir_rot = limit_period(bboxes[..., 6] - self.dir_offset,
self.dir_limit_offset, np.pi)
bboxes[..., 6] = (
dir_rot + self.dir_offset +
np.pi * dir_scores.to(bboxes.dtype))
bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size)
results = InstanceData()
results.bboxes_3d = bboxes
results.scores_3d = scores
results.labels_3d = labels
return results
# TODO: Support augmentation test
def aug_test(self,
aug_batch_feats,
aug_batch_input_metas,
rescale=False,
with_ori_nms=False,
**kwargs):
pass
 # Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
 import torch
 from mmcv.runner import force_fp32
+from torch import Tensor
 from torch.nn import functional as F
+from mmdet3d.core.utils import ConfigType, OptConfigType, OptMultiConfig
 from mmdet3d.registry import MODELS
 from .voxelnet import VoxelNet
...
@@ -13,17 +17,17 @@ class DynamicVoxelNet(VoxelNet):
     """
     def __init__(self,
-                 voxel_layer,
-                 voxel_encoder,
-                 middle_encoder,
-                 backbone,
-                 neck=None,
-                 bbox_head=None,
-                 train_cfg=None,
-                 test_cfg=None,
-                 pretrained=None,
-                 init_cfg=None):
-        super(DynamicVoxelNet, self).__init__(
+                 voxel_layer: ConfigType,
+                 voxel_encoder: ConfigType,
+                 middle_encoder: ConfigType,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 bbox_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
             voxel_layer=voxel_layer,
             voxel_encoder=voxel_encoder,
             middle_encoder=middle_encoder,
...
@@ -32,30 +36,19 @@ class DynamicVoxelNet(VoxelNet):
             bbox_head=bbox_head,
             train_cfg=train_cfg,
             test_cfg=test_cfg,
-            pretrained=pretrained,
+            data_preprocessor=data_preprocessor,
             init_cfg=init_cfg)
-    def extract_feat(self, points, img_metas):
-        """Extract features from points."""
-        voxels, coors = self.voxelize(points)
-        voxel_features, feature_coors = self.voxel_encoder(voxels, coors)
-        batch_size = coors[-1, 0].item() + 1
-        x = self.middle_encoder(voxel_features, feature_coors, batch_size)
-        x = self.backbone(x)
-        if self.with_neck:
-            x = self.neck(x)
-        return x
     @torch.no_grad()
     @force_fp32()
-    def voxelize(self, points):
+    def voxelize(self, points: List[torch.Tensor]) -> tuple:
         """Apply dynamic voxelization to points.
         Args:
-            points (list[torch.Tensor]): Points of each sample.
+            points (list[Tensor]): Points of each sample.
         Returns:
-            tuple[torch.Tensor]: Concatenated points and coordinates.
+            tuple[Tensor]: Concatenated points and coordinates.
         """
         coors = []
         # dynamic voxelization only provide a coors mapping
...
@@ -69,3 +62,16 @@ class DynamicVoxelNet(VoxelNet):
             coors_batch.append(coor_pad)
         coors_batch = torch.cat(coors_batch, dim=0)
         return points, coors_batch
+    def extract_feat(self, batch_inputs_dict: dict) -> Tuple[Tensor]:
+        """Extract features from points."""
+        # TODO: Remove voxelization to datapreprocessor
+        points = batch_inputs_dict['points']
+        voxels, coors = self.voxelize(points)
+        voxel_features, feature_coors = self.voxel_encoder(voxels, coors)
+        batch_size = coors[-1, 0].item() + 1
+        x = self.middle_encoder(voxel_features, feature_coors, batch_size)
+        x = self.backbone(x)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List, Tuple, Union
+from typing import Dict, List, Tuple, Union
 import torch
+from torch import Tensor
 from mmdet3d.core.utils import (ConfigType, OptConfigType, OptMultiConfig,
                                 OptSampleList, SampleList)
...
@@ -134,12 +135,19 @@ class SingleStage3DDetector(Base3DDetector):
         results = self.bbox_head.forward(x)
         return results
-    def extract_feat(self,
-                     batch_inputs_dict: torch.Tensor) -> Tuple[torch.Tensor]:
+    def extract_feat(
+        self, batch_inputs_dict: torch.Tensor
+    ) -> Union[Tuple[torch.Tensor], Dict[str, Tensor]]:
         """Directly extract features from the backbone+neck.
         Args:
             points (torch.Tensor): Input points.
+        Returns:
+            tuple[Tensor] | dict: For outdoor 3D object detection, we
+                typically obtain a tuple of features from the backbone + neck,
+                while for indoor 3D object detection a dict containing
+                features is usually obtained.
         """
         points = batch_inputs_dict['points']
         stack_points = torch.stack(points)
...
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List, Optional
+from typing import List, Tuple
 import torch
 from mmcv.ops import Voxelization
 from mmcv.runner import force_fp32
+from torch import Tensor
 from torch.nn import functional as F
-from mmdet3d.core import Det3DDataSample
+from mmdet3d.core.utils import ConfigType, OptConfigType, OptMultiConfig
 from mmdet3d.registry import MODELS
 from .single_stage import SingleStage3DDetector
...
@@ -16,39 +17,28 @@ class VoxelNet(SingleStage3DDetector):
     r"""`VoxelNet <https://arxiv.org/abs/1711.06396>`_ for 3D detection."""
     def __init__(self,
-                 voxel_layer: dict,
-                 voxel_encoder: dict,
-                 middle_encoder: dict,
-                 backbone: dict,
-                 neck: Optional[dict] = None,
-                 bbox_head: Optional[dict] = None,
-                 train_cfg: Optional[dict] = None,
-                 test_cfg: Optional[dict] = None,
-                 init_cfg: Optional[dict] = None,
-                 pretrained: Optional[str] = None) -> None:
-        super(VoxelNet, self).__init__(
+                 voxel_layer: ConfigType,
+                 voxel_encoder: ConfigType,
+                 middle_encoder: ConfigType,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 bbox_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
             backbone=backbone,
             neck=neck,
             bbox_head=bbox_head,
             train_cfg=train_cfg,
             test_cfg=test_cfg,
-            init_cfg=init_cfg,
-            pretrained=pretrained)
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
         self.voxel_layer = Voxelization(**voxel_layer)
         self.voxel_encoder = MODELS.build(voxel_encoder)
         self.middle_encoder = MODELS.build(middle_encoder)
-    def extract_feat(self, points: List[torch.Tensor]) -> list:
-        """Extract features from points."""
-        voxels, num_points, coors = self.voxelize(points)
-        voxel_features = self.voxel_encoder(voxels, num_points, coors)
-        batch_size = coors[-1, 0].item() + 1
-        x = self.middle_encoder(voxel_features, coors, batch_size)
-        x = self.backbone(x)
-        if self.with_neck:
-            x = self.neck(x)
-        return x
     @torch.no_grad()
     @force_fp32()
     def voxelize(self, points: List[torch.Tensor]) -> tuple:
...
@@ -68,75 +58,15 @@ class VoxelNet(SingleStage3DDetector):
         coors_batch = torch.cat(coors_batch, dim=0)
         return voxels, num_points, coors_batch
-    def forward_train(self, batch_inputs_dict: Dict[list, torch.Tensor],
-                      batch_data_samples: List[Det3DDataSample],
-                      **kwargs) -> dict:
-        """
-        Args:
-            batch_inputs_dict (dict): The model input dict. It should contain
-                ``points`` and ``img`` keys.
-                - points (list[torch.Tensor]): Point cloud of each sample.
-                - imgs (torch.Tensor, optional): Image of each sample.
-            batch_data_samples (list[:obj:`Det3DDataSample`]): The batch
-                data samples. It usually includes information such
-                as `gt_instance_3d` or `gt_panoptic_seg_3d` or `gt_sem_seg_3d`.
-        Returns:
-            dict[str, Tensor]: A dictionary of loss components.
-        """
-        x = self.extract_feat(batch_inputs_dict['points'])
-        losses = self.bbox_head.forward_train(x, batch_data_samples, **kwargs)
-        return losses
-    def simple_test(self,
-                    batch_inputs_dict: Dict[list, torch.Tensor],
-                    batch_input_metas: List[dict],
-                    rescale: bool = False) -> list:
-        """Test function without test-time augmentation.
-        Args:
-            batch_inputs_dict (dict): The model input dict. It should contain
-                ``points`` and ``img`` keys.
-                - points (list[torch.Tensor]): Point cloud of single
-                    sample.
-                - imgs (torch.Tensor, optional): Image of single sample.
-            batch_input_metas (list[dict]): List of input information.
-            rescale (bool, optional): Whether to rescale the results.
-                Defaults to False.
-        Returns:
-            list[:obj:`Det3DDataSample`]: Detection results of the \
-                inputs. Each Det3DDataSample usually contain \
-                'pred_instances_3d'. And the ``pred_instances_3d`` usually \
-                contains following keys.
-            - scores_3d (Tensor): Classification scores, has a shape
-                (num_instances, )
-            - labels_3d (Tensor): Labels of bboxes, has a shape
-                (num_instances, ).
-            - bboxes_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
-                contains a tensor with shape (num_instances, 7).
-        """
-        x = self.extract_feat(batch_inputs_dict['points'])
-        bboxes_list = self.bbox_head.simple_test(
-            x, batch_input_metas, rescale=rescale)
-        # connvert to Det3DDataSample
-        results_list = self.postprocess_result(bboxes_list)
-        return results_list
-    def aug_test(self,
-                 aug_batch_inputs_dict: Dict[list, torch.Tensor],
-                 aug_batch_input_metas: List[dict],
-                 rescale: bool = False) -> list:
-        """Test function with augmentaiton."""
-        # TODO Refactor this after mmdet update
-        feats = self.extract_feats(aug_batch_inputs_dict)
-        aug_bboxes = self.bbox_head.aug_test(
-            feats, aug_batch_input_metas, rescale=rescale)
-        return aug_bboxes
+    def extract_feat(self, batch_inputs_dict: dict) -> Tuple[Tensor]:
+        """Extract features from points."""
+        # TODO: Remove voxelization to datapreprocessor
+        points = batch_inputs_dict['points']
+        voxels, num_points, coors = self.voxelize(points)
+        voxel_features = self.voxel_encoder(voxels, num_points, coors)
+        batch_size = coors[-1, 0].item() + 1
+        x = self.middle_encoder(voxel_features, coors, batch_size)
+        x = self.backbone(x)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
# Copyright (c) OpenMMLab. All rights reserved.
from unittest import TestCase
import torch
from mmengine import Config
from mmengine.data import InstanceData
from mmdet3d import * # noqa
from mmdet3d.core.bbox import Box3DMode, LiDARInstance3DBoxes
from mmdet3d.models.dense_heads import Anchor3DHead
class TestAnchor3DHead(TestCase):
def test_anchor3d_head_loss(self):
"""Test anchor head loss when truth is empty and non-empty."""
cfg = Config(
dict(
assigner=[
dict( # for Pedestrian
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.35,
neg_iou_thr=0.2,
min_pos_iou=0.2,
ignore_iof_thr=-1),
dict( # for Cyclist
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.35,
neg_iou_thr=0.2,
min_pos_iou=0.2,
ignore_iof_thr=-1),
dict( # for Car
type='Max3DIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
],
allowed_border=0,
pos_weight=-1,
debug=False))
anchor3d_head = Anchor3DHead(
num_classes=3,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78],
],
sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=0.2),
train_cfg=cfg)
        # Anchor head expects multiple levels of features per image
feats = (torch.rand([1, 512, 200, 176], dtype=torch.float32), )
(cls_scores, bbox_preds, dir_cls_preds) = anchor3d_head.forward(feats)
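        # 3 anchor sizes x 2 rotations = 6 anchors per location, so the head
        # should output 6 * 3 classes = 18 cls channels, 6 * 7 box parameters
        # = 42 bbox channels and 6 * 2 = 12 direction channels per cell.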
self.assertEqual(cls_scores[0].shape, torch.Size([1, 18, 200, 176]))
self.assertEqual(bbox_preds[0].shape, torch.Size([1, 42, 200, 176]))
self.assertEqual(dir_cls_preds[0].shape, torch.Size([1, 12, 200, 176]))
        # Test that empty ground truth encourages the network to
        # predict background
gt_instances = InstanceData()
gt_bboxes_3d = LiDARInstance3DBoxes(torch.empty((0, 7)))
gt_labels_3d = torch.tensor([])
input_metas = dict(sample_idx=1234)
# fake input_metas
gt_instances.bboxes_3d = gt_bboxes_3d
gt_instances.labels_3d = gt_labels_3d
empty_gt_losses = anchor3d_head.loss_by_feat(cls_scores, bbox_preds,
dir_cls_preds,
[gt_instances],
[input_metas])
# When there is no truth, the cls loss should be nonzero but
# there should be no box and dir loss.
self.assertGreater(empty_gt_losses['loss_cls'][0], 0,
'cls loss should be non-zero')
self.assertEqual(
empty_gt_losses['loss_bbox'][0], 0,
'there should be no box loss when there are no true boxes')
self.assertEqual(
empty_gt_losses['loss_dir'][0], 0,
'there should be no dir loss when there are no true dirs')
# When truth is non-empty then both cls and box loss
# should be nonzero for random inputs
gt_instances = InstanceData()
gt_bboxes_3d = LiDARInstance3DBoxes(
torch.tensor(
[[6.4118, -3.4305, -1.7291, 1.7033, 3.4693, 1.6197, -0.9091]],
dtype=torch.float32))
gt_labels_3d = torch.tensor([1], dtype=torch.int64)
gt_instances.bboxes_3d = gt_bboxes_3d
gt_instances.labels_3d = gt_labels_3d
gt_losses = anchor3d_head.loss_by_feat(cls_scores, bbox_preds,
dir_cls_preds, [gt_instances],
[input_metas])
self.assertGreater(gt_losses['loss_cls'][0], 0,
'cls loss should be non-zero')
self.assertGreater(gt_losses['loss_bbox'][0], 0,
'box loss should be non-zero')
self.assertGreater(gt_losses['loss_dir'][0], 0,
                           'dir loss should be non-zero')
def test_anchor3d_head_predict(self):
cfg = Config(
dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.1,
min_bbox_size=0,
nms_pre=100,
max_num=50))
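        # ``nms_pre`` caps the candidates kept per level before NMS and
        # ``max_num`` caps the number of boxes returned after NMS.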
anchor3d_head = Anchor3DHead(
num_classes=3,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78],
],
sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=0.2),
test_cfg=cfg)
feats = (torch.rand([2, 512, 200, 176], dtype=torch.float32), )
(cls_scores, bbox_preds, dir_cls_preds) = anchor3d_head.forward(feats)
# fake input_metas
input_metas = [{
'sample_idx': 1234,
'box_type_3d': LiDARInstance3DBoxes,
'box_mode_3d': Box3DMode.LIDAR
}, {
'sample_idx': 2345,
'box_type_3d': LiDARInstance3DBoxes,
'box_mode_3d': Box3DMode.LIDAR
}]
# test get_boxes
cls_scores[0] -= 1.5 # too many positive samples may cause cuda oom
results = anchor3d_head.predict_by_feat(cls_scores, bbox_preds,
dir_cls_preds, input_metas)
pred_instances = results[0]
scores_3d = pred_instances.scores_3d
assert (scores_3d > 0.3).all()