# Copyright (c) OpenMMLab. All rights reserved. import torch from mmdet.models.detectors import BaseDetector from mmdet3d.registry import MODELS from mmdet3d.structures.ops import bbox3d2result from mmdet3d.utils import ConfigType @MODELS.register_module() class DfM(BaseDetector): r"""`Monocular 3D Object Detection with Depth from Motion. `_. Args: backbone (:obj:`ConfigDict` or dict): The backbone config. neck (:obj:`ConfigDict` or dict): The neck config. backbone_stereo (:obj:`ConfigDict` or dict): The stereo backbone config. backbone_3d (:obj:`ConfigDict` or dict): The 3d backbone config. neck_3d (:obj:`ConfigDict` or dict): The 3D neck config. bbox_head_3d (:obj:`ConfigDict` or dict): The 3d bbox head config. neck_2d (:obj:`ConfigDict` or dict, optional): The 2D neck config for 2D object detection. Defaults to None. bbox_head_2d (:obj:`ConfigDict` or dict, optional): The 2D bbox head config for 2D object detection. Defaults to None. depth_head_2d (:obj:`ConfigDict` or dict, optional): The 2D depth head config for depth estimation in fov space. Defaults to None. depth_head (:obj:`ConfigDict` or dict, optional): The depth head config for depth estimation in 3D voxel projected to fov space . train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of training hyper-parameters. Defaults to None. test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test hyper-parameters. Defaults to None. pretrained (:obj: `ConfigDict` or dict optional): The pretrained config. init_cfg (:obj:`ConfigDict` or dict, optional): The initialization config. Defaults to None. """ def __init__(self, backbone: ConfigType, neck: ConfigType, backbone_stereo: ConfigType, backbone_3d: ConfigType, neck_3d: ConfigType, bbox_head_3d: ConfigType, neck_2d=None, bbox_head_2d=None, depth_head_2d=None, depth_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super().__init__(init_cfg=init_cfg) self.backbone = MODELS.build(backbone) self.neck = MODELS.build(neck) if backbone_stereo is not None: backbone_stereo.update(cat_img_feature=self.neck.cat_img_feature) backbone_stereo.update(in_sem_channels=self.neck.sem_channels[-1]) self.backbone_stereo = MODELS.build(backbone_stereo) assert self.neck.cat_img_feature == \ self.backbone_stereo.cat_img_feature assert self.neck.sem_channels[ -1] == self.backbone_stereo.in_sem_channels if backbone_3d is not None: self.backbone_3d = MODELS.build(backbone_3d) if neck_3d is not None: self.neck_3d = MODELS.build(neck_3d) if neck_2d is not None: self.neck_2d = MODELS.build(neck_2d) if bbox_head_2d is not None: self.bbox_head_2d = MODELS.build(bbox_head_2d) if depth_head_2d is not None: self.depth_head_2d = MODELS.build(depth_head_2d) if depth_head is not None: self.depth_head = MODELS.build(depth_head) self.depth_samples = self.depth_head.depth_samples self.train_cfg = train_cfg self.test_cfg = test_cfg bbox_head_3d.update(train_cfg=train_cfg) bbox_head_3d.update(test_cfg=test_cfg) self.bbox_head_3d = MODELS.build(bbox_head_3d) @property def with_backbone_3d(self): """Whether the detector has a 3D backbone.""" return hasattr(self, 'backbone_3d') and self.backbone_3d is not None @property def with_neck_3d(self): """Whether the detector has a 3D neck.""" return hasattr(self, 'neck_3d') and self.neck_3d is not None @property def with_neck_2d(self): """Whether the detector has a 2D neck.""" return hasattr(self, 'neck_2d') and self.neck_2d is not None @property def with_bbox_head_2d(self): """Whether the detector has a 2D detection head.""" return hasattr(self, 'bbox_head_2d') and self.bbox_head_2d is not None @property def with_depth_head_2d(self): """Whether the detector has a image-based depth head.""" return hasattr(self, 'depth_head_2d') and self.depth_head_2d is not None @property def with_depth_head(self): """Whether the detector has a frustum-based depth head.""" return hasattr(self, 'depth_head') and self.depth_head is not None def extract_feat(self, img, img_metas): """Feature extraction for perspective-view images. Args: img (torch.Tensor): Images of shape [B, N, C_in, H, W]. img_metas (list): Image meta information. Each element corresponds to a group of images. len(img_metas) == B. Returns: torch.Tensor: bev feature with shape [B, C_out, N_y, N_x]. """ # split input img into current and previous ones batch_size, N, C_in, H, W = img.shape cur_imgs = img[:, 0] prev_imgs = img[:, 1] # TODO: to support multiple prev imgs # 2D backbone for feature extraction cur_feats = self.backbone(cur_imgs) cur_feats = [cur_imgs] + list(cur_feats) prev_feats = self.backbone(prev_imgs) prev_feats = [prev_imgs] + list(prev_feats) # SPP module as the feature neck cur_stereo_feat, cur_sem_feat = self.neck(cur_feats) prev_stereo_feat, prev_sem_feat = self.neck(prev_feats) # derive cur2prevs cur_pose = torch.tensor( [img_meta['cam2global'] for img_meta in img_metas], device=img.device)[:, None, :, :] # (B, 1, 4, 4) prev_poses = [] for img_meta in img_metas: sweep_img_metas = img_meta['sweep_img_metas'] prev_poses.append([ sweep_img_meta['cam2global'] for sweep_img_meta in sweep_img_metas ]) prev_poses = torch.tensor(prev_poses, device=img.device) pad_prev_cam2global = torch.eye(4)[None, None].expand( batch_size, N - 1, 4, 4).to(img.device) pad_prev_cam2global[:, :, :prev_poses.shape[-2], :prev_poses. shape[-1]] = prev_poses pad_cur_cam2global = torch.eye(4)[None, None].expand(batch_size, 1, 4, 4).to(img.device) pad_cur_cam2global[:, :, :cur_pose.shape[-2], :cur_pose. shape[-1]] = cur_pose # (B, N-1, 4, 4) * (B, 1, 4, 4) -> (B, N-1, 4, 4) # torch.linalg.solve is faster and more numerically stable # than torch.matmul(torch.linalg.inv(A), B) # empirical results show that torch.linalg.solve can derive # almost the same result with np.linalg.inv # while torch.linalg.inv can not cur2prevs = torch.linalg.solve(pad_prev_cam2global, pad_cur_cam2global) for meta_idx, img_meta in enumerate(img_metas): img_meta['cur2prevs'] = cur2prevs[meta_idx] # stereo backbone for depth estimation # volume_feat: (batch_size, Cv, Nz, Ny, Nx) volume_feat = self.backbone_stereo(cur_stereo_feat, prev_stereo_feat, img_metas, cur_sem_feat) # height compression _, Cv, Nz, Ny, Nx = volume_feat.shape bev_feat = volume_feat.view(batch_size, Cv * Nz, Ny, Nx) bev_feat_prehg, bev_feat = self.neck_3d(bev_feat) return bev_feat def forward_train(self, img, img_metas, gt_bboxes_3d, gt_labels_3d, depth_img=None, **kwargs): """Forward function for training.""" bev_feat = self.extract_feat(img, img_metas) outs = self.bbox_head_3d([bev_feat]) losses = self.bbox_head_3d.loss(*outs, gt_bboxes_3d, gt_labels_3d, img_metas) # TODO: loss_dense_depth, loss_2d, loss_imitation return losses def forward_test(self, img, img_metas, **kwargs): """Forward of testing. Args: img (torch.Tensor): Input images of shape (N, C_in, H, W). img_metas (list): Image metas. Returns: list[dict]: Predicted 3d boxes. """ # not supporting aug_test for now return self.simple_test(img, img_metas) def simple_test(self, img, img_metas): """Simple inference forward without test time augmentation.""" bev_feat = self.extract_feat(img, img_metas) # bbox_head takes a list of feature from different levels as input # so need [bev_feat] outs = self.bbox_head_3d([bev_feat]) bbox_list = self.bbox_head_3d.get_bboxes(*outs, img_metas) bbox_results = [ bbox3d2result(det_bboxes, det_scores, det_labels) for det_bboxes, det_scores, det_labels in bbox_list ] # add pseudo-lidar label to each pred_dict for post-processing for bbox_result in bbox_results: bbox_result['pseudo_lidar'] = True return bbox_results def aug_test(self, imgs, img_metas, **kwargs): """Test with augmentations. Args: imgs (list[torch.Tensor]): Input images of shape (N, C_in, H, W). img_metas (list): Image metas. Returns: list[dict]: Predicted 3d boxes. """ raise NotImplementedError