Unverified Commit 4002f06c authored by twang, committed by GitHub

[Feature] Support FCOS3D Detector (#436)

* Support single stage mono3d detector

* Add single stage mono3d detector in the init

* Support FCOS Mono3D detector

* Minor adjustment of docstring

* Fix detector import

* Move numpy import to header
parent 8d4bef28
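
With this commit, a model config can select the new detector through its `type` field. A rough illustration (the backbone/neck/head values below are placeholders, not the shipped FCOS3D config, and `FCOSMono3DHead` is assumed from the companion head implementation):

model = dict(
    type='FCOSMono3D',
    backbone=dict(type='ResNet', depth=101),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    bbox_head=dict(type='FCOSMono3DHead'))  # hypothetical head config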
mmdet3d/models/detectors/__init__.py

from .base import Base3DDetector
from .centerpoint import CenterPoint
from .dynamic_voxelnet import DynamicVoxelNet
from .fcos_mono3d import FCOSMono3D
from .h3dnet import H3DNet
from .imvotenet import ImVoteNet
from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN
from .mvx_two_stage import MVXTwoStageDetector
from .parta2 import PartA2
from .single_stage_mono3d import SingleStageMono3DDetector
from .ssd3dnet import SSD3DNet
from .votenet import VoteNet
from .voxelnet import VoxelNet

__all__ = [
    'Base3DDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXTwoStageDetector',
    'DynamicMVXFasterRCNN', 'MVXFasterRCNN', 'PartA2', 'VoteNet', 'H3DNet',
    'CenterPoint', 'SSD3DNet', 'ImVoteNet', 'SingleStageMono3DDetector',
    'FCOSMono3D'
]
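
Once this package is imported, the new detector is resolvable through the mmdet DETECTORS registry. A minimal sketch (the lookup itself is standard mmcv Registry usage; nothing here is specific to this commit):

from mmdet.models.builder import DETECTORS
import mmdet3d.models.detectors  # noqa: F401, triggers @register_module()

fcos_mono3d_cls = DETECTORS.get('FCOSMono3D')
print(fcos_mono3d_cls)  # the FCOSMono3D class defined below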
mmdet3d/models/detectors/fcos_mono3d.py

from mmdet.models.builder import DETECTORS
from .single_stage_mono3d import SingleStageMono3DDetector


@DETECTORS.register_module()
class FCOSMono3D(SingleStageMono3DDetector):
    """Implementation of FCOS3D. The technical report will be released soon.

    Currently, please refer to our entry on the `nuScenes leaderboard
    <https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Camera>`_.  # noqa
    """

    def __init__(self,
                 backbone,
                 neck,
                 bbox_head,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(FCOSMono3D, self).__init__(backbone, neck, bbox_head, train_cfg,
                                         test_cfg, pretrained)
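
The @DETECTORS.register_module() decorator used above is mmcv's standard Registry mechanism. A self-contained toy sketch of the same pattern (ToyMono3D and TOY_DETECTORS are illustrative names, not part of this commit):

from mmcv.utils import Registry, build_from_cfg

TOY_DETECTORS = Registry('toy_detector')

@TOY_DETECTORS.register_module()
class ToyMono3D:
    def __init__(self, depth=50):
        self.depth = depth

# A config dict's 'type' field selects the registered class; the remaining
# keys become constructor kwargs, exactly as for FCOSMono3D above.
model = build_from_cfg(dict(type='ToyMono3D', depth=101), TOY_DETECTORS)
assert model.depth == 101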
mmdet3d/models/detectors/single_stage_mono3d.py

import numpy as np
import torch

from mmdet3d.core import bbox3d2result
from mmdet.core import bbox2result
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors.single_stage import SingleStageDetector


@DETECTORS.register_module()
class SingleStageMono3DDetector(SingleStageDetector):
    """Base class for monocular 3D single-stage detectors.

    Single-stage detectors directly and densely predict bounding boxes on the
    output features of the backbone+neck.
    """

    def extract_feats(self, imgs):
        """Directly extract features from the backbone+neck."""
        assert isinstance(imgs, list)
        return [self.extract_feat(img) for img in imgs]
    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_3d,
                      gt_labels_3d,
                      centers2d,
                      depths,
                      attr_labels=None,
                      gt_bboxes_ignore=None):
        """
        Args:
            img (Tensor): Input images of shape (N, C, H, W).
                Typically these should be mean centered and std scaled.
            img_metas (list[dict]): A list of image info dicts where each
                dict has: 'img_shape', 'scale_factor', 'flip', and may also
                contain 'filename', 'ori_shape', 'pad_shape', and
                'img_norm_cfg'. For details on the values of these keys,
                see :class:`mmdet.datasets.pipelines.Collect`.
            gt_bboxes (list[Tensor]): Ground truth bboxes of each image in
                [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): Class indices corresponding to each
                box.
            gt_bboxes_3d (list[Tensor]): Ground truth 3D boxes of each image
                in [x, y, z, w, l, h, theta, vx, vy] format.
            gt_labels_3d (list[Tensor]): 3D class indices corresponding to
                each box.
            centers2d (list[Tensor]): Projected 3D centers onto 2D images.
            depths (list[Tensor]): Depths of projected centers on 2D images.
            attr_labels (list[Tensor], optional): Attribute indices
                corresponding to each box.
            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
                boxes can be ignored when computing the loss.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        x = self.extract_feat(img)
        losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes,
                                              gt_labels, gt_bboxes_3d,
                                              gt_labels_3d, centers2d, depths,
                                              attr_labels, gt_bboxes_ignore)
        return losses
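
    # Illustration (hypothetical shapes, not part of this commit): for a
    # batch of two images with 5 and 3 annotated objects respectively,
    # forward_train would receive something like
    #   gt_bboxes    = [Tensor(5, 4), Tensor(3, 4)]  # 2D [tl_x, tl_y, br_x, br_y]
    #   gt_bboxes_3d = [Tensor(5, 9), Tensor(3, 9)]  # [x, y, z, w, l, h, theta, vx, vy]
    #   centers2d    = [Tensor(5, 2), Tensor(3, 2)]  # projected 3D centers
    #   depths       = [Tensor(5,), Tensor(3,)]      # depth of each center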
    def simple_test(self, img, img_metas, rescale=False):
        """Test function without test time augmentation.

        Args:
            img (torch.Tensor): Input images of shape (N, C, H, W).
            img_metas (list[dict]): List of image information.
            rescale (bool, optional): Whether to rescale the results.
                Defaults to False.

        Returns:
            list[dict]: Predicted 3D boxes (and, if the head also predicts
                2D boxes, 2D boxes) for each image.
        """
        x = self.extract_feat(img)
        outs = self.bbox_head(x)
        bbox_outputs = self.bbox_head.get_bboxes(
            *outs, img_metas, rescale=rescale)

        if self.bbox_head.pred_bbox2d:
            bbox2d_img = [
                bbox2result(bboxes2d, labels, self.bbox_head.num_classes)
                for bboxes, scores, labels, attrs, bboxes2d in bbox_outputs
            ]
            bbox_outputs = [bbox_outputs[0][:-1]]

        bbox_img = [
            bbox3d2result(bboxes, scores, labels, attrs)
            for bboxes, scores, labels, attrs in bbox_outputs
        ]

        bbox_list = [dict() for _ in range(len(img_metas))]
        for result_dict, img_bbox in zip(bbox_list, bbox_img):
            result_dict['img_bbox'] = img_bbox
        if self.bbox_head.pred_bbox2d:
            for result_dict, img_bbox2d in zip(bbox_list, bbox2d_img):
                result_dict['img_bbox2d'] = img_bbox2d
        return bbox_list
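
    # Illustration (hypothetical, single image): with pred_bbox2d enabled,
    # simple_test returns
    #   [{'img_bbox': dict(boxes_3d=..., scores_3d=..., labels_3d=..., attrs_3d=...),
    #     'img_bbox2d': [np.ndarray of shape (k, 5) per class]}]
    # where the inner dicts come from bbox3d2result and bbox2result.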
    def aug_test(self, imgs, img_metas, rescale=False):
        """Test function with test time augmentation."""
        feats = self.extract_feats(imgs)

        # only support aug_test for one sample
        outs_list = [self.bbox_head(x) for x in feats]
        for i, img_meta in enumerate(img_metas):
            if img_meta[0]['pcd_horizontal_flip']:
                for j in range(len(outs_list[i])):  # for each prediction
                    if outs_list[i][j][0] is None:
                        continue
                    for k in range(len(outs_list[i][j])):
                        # every stride of featmap
                        outs_list[i][j][k] = torch.flip(
                            outs_list[i][j][k], dims=[3])
                reg = outs_list[i][1]
                for reg_feat in reg:
                    # offset_x: mirrored by the horizontal flip
                    reg_feat[:, 0, :, :] = 1 - reg_feat[:, 0, :, :]
                    # velo_x
                    if self.bbox_head.pred_velo:
                        reg_feat[:, 7, :, :] = -reg_feat[:, 7, :, :]
                    # rotation
                    reg_feat[:, 6, :, :] = -reg_feat[:, 6, :, :] + np.pi

        merged_outs = []
        for i in range(len(outs_list[0])):  # for each prediction
            merged_feats = []
            for j in range(len(outs_list[0][i])):
                if outs_list[0][i][0] is None:
                    merged_feats.append(None)
                    continue
                # for each stride of featmap
                avg_feats = torch.mean(
                    torch.cat([x[i][j] for x in outs_list]),
                    dim=0,
                    keepdim=True)
                if i == 1:  # regression predictions
                    # rot/velo/2d det keeps the original
                    avg_feats[:, 6:, :, :] = \
                        outs_list[0][i][j][:, 6:, :, :]
                if i == 2:
                    # dir_cls keeps the original
                    avg_feats = outs_list[0][i][j]
                merged_feats.append(avg_feats)
            merged_outs.append(merged_feats)
        merged_outs = tuple(merged_outs)

        bbox_outputs = self.bbox_head.get_bboxes(
            *merged_outs, img_metas[0], rescale=rescale)
        if self.bbox_head.pred_bbox2d:
            bbox2d_img = [
                bbox2result(bboxes2d, labels, self.bbox_head.num_classes)
                for bboxes, scores, labels, attrs, bboxes2d in bbox_outputs
            ]
            bbox_outputs = [bbox_outputs[0][:-1]]

        bbox_img = [
            bbox3d2result(bboxes, scores, labels, attrs)
            for bboxes, scores, labels, attrs in bbox_outputs
        ]

        bbox_list = dict()
        bbox_list.update(img_bbox=bbox_img[0])
        if self.bbox_head.pred_bbox2d:
            bbox_list.update(img_bbox2d=bbox2d_img[0])
        return [bbox_list]
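
Putting the pieces together, a rough inference sketch against this API (it assumes `model` is an already built and loaded FCOSMono3D; the image size and meta keys are illustrative, and real pipelines populate many more meta fields such as camera intrinsics):

import torch

model.eval()
img = torch.randn(1, 3, 928, 1600)  # one padded nuScenes-style camera image
img_metas = [dict(
    img_shape=(900, 1600, 3),
    ori_shape=(900, 1600, 3),
    pad_shape=(928, 1600, 3),
    scale_factor=1.0,
    flip=False)]  # hypothetical minimal metas
with torch.no_grad():
    results = model.simple_test(img, img_metas)
print(results[0]['img_bbox'].keys())  # boxes_3d / scores_3d / labels_3d / attrs_3d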