Unverified commit 32a4328b, authored by Wenwei Zhang, committed by GitHub

Bump version to V1.0.0rc0

parents 86cc487c a8817998
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
from mmcv.cnn import Scale, bias_init_with_prob, normal_init
from mmcv.runner import force_fp32
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.core import box3d_multiclass_nms, xywhr2xyxyr
from mmdet3d.core.bbox import points_cam2img, points_img2cam
from mmdet.core import distance2bbox, multi_apply
from mmdet.models.builder import HEADS, build_loss
from .fcos_mono3d_head import FCOSMono3DHead
@HEADS.register_module()
class PGDHead(FCOSMono3DHead):
r"""Anchor-free head used in `PGD <https://arxiv.org/abs/2107.14160>`_.
Args:
use_depth_classifier (bool, optional): Whether to use the depth classifier.
Defaults to True.
use_onlyreg_proj (bool, optional): Whether to use only the directly
regressed depth in the re-projection (to make the network easier
to learn). Defaults to False.
weight_dim (int, optional): Dimension of the location-aware weight
map. Defaults to -1.
weight_branch (tuple[tuple[int]], optional): Feature map channels of
the convolutional branch for weight map. Defaults to ((256, ), ).
depth_branch (tuple[int], optional): Feature map channels of the
branch for probabilistic depth estimation. Defaults to (64, ).
depth_range (tuple[float], optional): Range of depth estimation.
Defaults to (0, 70).
depth_unit (int, optional): Unit of depth range division. Defaults to
10.
division (str, optional): Depth division method. Options include
'uniform', 'linear', 'log', 'loguniform'. Defaults to 'uniform'.
depth_bins (int, optional): Discrete bins of depth division. Defaults
to 8.
loss_depth (dict, optional): Depth loss. Defaults to dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0).
loss_bbox2d (dict, optional): Loss for 2D box estimation. Defaults to
dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0).
loss_consistency (dict, optional): Consistency loss. Defaults to
dict(type='GIoULoss', loss_weight=1.0).
pred_velo (bool, optional): Whether to predict velocity. Defaults to
False.
pred_bbox2d (bool, optional): Whether to predict 2D bounding boxes.
Defaults to True.
pred_keypoints (bool, optional): Whether to predict keypoints.
Defaults to False.
bbox_coder (dict, optional): Bounding box coder. Defaults to
dict(type='PGDBBoxCoder', base_depths=((28.01, 16.32), ),
base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), (3.9, 1.56, 1.6)),
code_size=7).
"""
def __init__(self,
use_depth_classifier=True,
use_onlyreg_proj=False,
weight_dim=-1,
weight_branch=((256, ), ),
depth_branch=(64, ),
depth_range=(0, 70),
depth_unit=10,
division='uniform',
depth_bins=8,
loss_depth=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_bbox2d=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_consistency=dict(type='GIoULoss', loss_weight=1.0),
pred_bbox2d=True,
pred_keypoints=False,
bbox_coder=dict(
type='PGDBBoxCoder',
base_depths=((28.01, 16.32), ),
base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6),
(3.9, 1.56, 1.6)),
code_size=7),
**kwargs):
self.use_depth_classifier = use_depth_classifier
self.use_onlyreg_proj = use_onlyreg_proj
self.depth_branch = depth_branch
self.pred_keypoints = pred_keypoints
self.weight_dim = weight_dim
self.weight_branch = weight_branch
self.weight_out_channels = []
for weight_branch_channels in weight_branch:
if len(weight_branch_channels) > 0:
self.weight_out_channels.append(weight_branch_channels[-1])
else:
self.weight_out_channels.append(-1)
self.depth_range = depth_range
self.depth_unit = depth_unit
self.division = division
if self.division == 'uniform':
self.num_depth_cls = int(
(depth_range[1] - depth_range[0]) / depth_unit) + 1
if self.num_depth_cls != depth_bins:
print('Warning: The number of bins computed from ' +
'depth_unit is different from given parameter! ' +
'Depth_unit will be considered with priority in ' +
'Uniform Division.')
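# Worked example (illustrative): with depth_range=(0, 70) and
# depth_unit=10, uniform division gives int((70 - 0) / 10) + 1 = 8 depth
# bins, so a conflicting `depth_bins` argument is overridden here.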
else:
self.num_depth_cls = depth_bins
super().__init__(
pred_bbox2d=pred_bbox2d, bbox_coder=bbox_coder, **kwargs)
self.loss_depth = build_loss(loss_depth)
if self.pred_bbox2d:
self.loss_bbox2d = build_loss(loss_bbox2d)
self.loss_consistency = build_loss(loss_consistency)
if self.pred_keypoints:
self.kpts_start = 9 if self.pred_velo else 7
def _init_layers(self):
"""Initialize layers of the head."""
super()._init_layers()
if self.pred_bbox2d:
self.scale_dim += 1
if self.pred_keypoints:
self.scale_dim += 1
self.scales = nn.ModuleList([
nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)])
for _ in self.strides
])
def _init_predictor(self):
"""Initialize predictor layers of the head."""
super()._init_predictor()
if self.use_depth_classifier:
self.conv_depth_cls_prev = self._init_branch(
conv_channels=self.depth_branch,
conv_strides=(1, ) * len(self.depth_branch))
self.conv_depth_cls = nn.Conv2d(self.depth_branch[-1],
self.num_depth_cls, 1)
# Data-agnostic single param lambda for local depth fusion
self.fuse_lambda = nn.Parameter(torch.tensor(10e-5))
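# How this parameter is used downstream (see the loss and decoding code
# below): the final depth is a learned blend of the two estimates,
#   depth = sigmoid(fuse_lambda) * directly_regressed_depth
#           + (1 - sigmoid(fuse_lambda)) * probabilistic_depth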
if self.weight_dim != -1:
self.conv_weight_prevs = nn.ModuleList()
self.conv_weights = nn.ModuleList()
for i in range(self.weight_dim):
weight_branch_channels = self.weight_branch[i]
weight_out_channel = self.weight_out_channels[i]
if len(weight_branch_channels) > 0:
self.conv_weight_prevs.append(
self._init_branch(
conv_channels=weight_branch_channels,
conv_strides=(1, ) * len(weight_branch_channels)))
self.conv_weights.append(
nn.Conv2d(weight_out_channel, 1, 1))
else:
self.conv_weight_prevs.append(None)
self.conv_weights.append(
nn.Conv2d(self.feat_channels, 1, 1))
def init_weights(self):
"""Initialize weights of the head.
We still use a customized init_weights here because the default DCN
initialization triggered by init_cfg would also initialize
conv_offset.weight, which mistakenly hurts training stability.
"""
super().init_weights()
bias_cls = bias_init_with_prob(0.01)
if self.use_depth_classifier:
for m in self.conv_depth_cls_prev:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
normal_init(self.conv_depth_cls, std=0.01, bias=bias_cls)
if self.weight_dim != -1:
for conv_weight_prev in self.conv_weight_prevs:
if conv_weight_prev is None:
continue
for m in conv_weight_prev:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
for conv_weight in self.conv_weights:
normal_init(conv_weight, std=0.01)
def forward(self, feats):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
tuple:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2).
weight (list[Tensor]): Location-aware weight maps on each
scale level, each is a 4D-tensor, the channel number is
num_points * 1.
depth_cls_preds (list[Tensor]): Box scores for depth class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * self.num_depth_cls.
attr_preds (list[Tensor]): Attribute scores for each scale
level, each is a 4D-tensor, the channel number is
num_points * num_attrs.
centernesses (list[Tensor]): Centerness for each scale level,
each is a 4D-tensor, the channel number is num_points * 1.
"""
return multi_apply(self.forward_single, feats, self.scales,
self.strides)
def forward_single(self, x, scale, stride):
"""Forward features of a single scale level.
Args:
x (Tensor): FPN feature maps of the specified stride.
scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize
the bbox prediction.
stride (int): The corresponding stride for feature maps, only
used to normalize the bbox prediction when self.norm_on_bbox
is True.
Returns:
tuple: scores for each class, bbox and direction class
predictions, depth class predictions, location-aware weights,
attribute and centerness predictions of input feature maps.
"""
cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, cls_feat, \
reg_feat = super().forward_single(x, scale, stride)
max_regress_range = stride * self.regress_ranges[0][1] / \
self.strides[0]
bbox_pred = self.bbox_coder.decode_2d(bbox_pred, scale, stride,
max_regress_range, self.training,
self.pred_keypoints,
self.pred_bbox2d)
depth_cls_pred = None
if self.use_depth_classifier:
clone_reg_feat = reg_feat.clone()
for conv_depth_cls_prev_layer in self.conv_depth_cls_prev:
clone_reg_feat = conv_depth_cls_prev_layer(clone_reg_feat)
depth_cls_pred = self.conv_depth_cls(clone_reg_feat)
weight = None
if self.weight_dim != -1:
weight = []
for i in range(self.weight_dim):
clone_reg_feat = reg_feat.clone()
if len(self.weight_branch[i]) > 0:
for conv_weight_prev_layer in self.conv_weight_prevs[i]:
clone_reg_feat = conv_weight_prev_layer(clone_reg_feat)
weight.append(self.conv_weights[i](clone_reg_feat))
weight = torch.cat(weight, dim=1)
return cls_score, bbox_pred, dir_cls_pred, depth_cls_pred, weight, \
attr_pred, centerness
def get_proj_bbox2d(self,
bbox_preds,
pos_dir_cls_preds,
labels_3d,
bbox_targets_3d,
pos_points,
pos_inds,
img_metas,
pos_depth_cls_preds=None,
pos_weights=None,
pos_cls_scores=None,
with_kpts=False):
"""Decode box predictions and get projected 2D attributes.
Args:
bbox_preds (list[Tensor]): Box predictions for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
pos_dir_cls_preds (Tensor): Box scores for direction class
predictions of positive boxes on all the scale levels in shape
(num_pos_points, 2).
labels_3d (list[Tensor]): 3D box category labels for each scale
level, each is a 4D-tensor.
bbox_targets_3d (list[Tensor]): 3D box targets for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
pos_points (Tensor): Foreground points.
pos_inds (Tensor): Index of foreground points from flattened
tensors.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
pos_depth_cls_preds (Tensor, optional): Probabilistic depth map of
positive boxes on all the scale levels in shape
(num_pos_points, self.num_depth_cls). Defaults to None.
pos_weights (Tensor, optional): Location-aware weights of positive
boxes in shape (num_pos_points, self.weight_dim). Defaults to
None.
pos_cls_scores (Tensor, optional): Classification scores of
positive boxes in shape (num_pos_points, self.num_classes).
Defaults to None.
with_kpts (bool, optional): Whether to output keypoints targets.
Defaults to False.
Returns:
tuple[Tensor]: Exterior 2D boxes from projected 3D boxes,
predicted 2D boxes and keypoint targets (if necessary).
"""
views = [np.array(img_meta['cam2img']) for img_meta in img_metas]
num_imgs = len(img_metas)
img_idx = []
for label in labels_3d:
for idx in range(num_imgs):
img_idx.append(
labels_3d[0].new_ones(int(len(label) / num_imgs)) * idx)
img_idx = torch.cat(img_idx)
pos_img_idx = img_idx[pos_inds]
flatten_strided_bbox_preds = []
flatten_strided_bbox2d_preds = []
flatten_bbox_targets_3d = []
flatten_strides = []
for stride_idx, bbox_pred in enumerate(bbox_preds):
flatten_bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(
-1, sum(self.group_reg_dims))
flatten_bbox_pred[:, :2] *= self.strides[stride_idx]
flatten_bbox_pred[:, -4:] *= self.strides[stride_idx]
flatten_strided_bbox_preds.append(
flatten_bbox_pred[:, :self.bbox_coder.bbox_code_size])
flatten_strided_bbox2d_preds.append(flatten_bbox_pred[:, -4:])
bbox_target_3d = bbox_targets_3d[stride_idx].clone()
bbox_target_3d[:, :2] *= self.strides[stride_idx]
bbox_target_3d[:, -4:] *= self.strides[stride_idx]
flatten_bbox_targets_3d.append(bbox_target_3d)
flatten_stride = flatten_bbox_pred.new_ones(
*flatten_bbox_pred.shape[:-1], 1) * self.strides[stride_idx]
flatten_strides.append(flatten_stride)
flatten_strided_bbox_preds = torch.cat(flatten_strided_bbox_preds)
flatten_strided_bbox2d_preds = torch.cat(flatten_strided_bbox2d_preds)
flatten_bbox_targets_3d = torch.cat(flatten_bbox_targets_3d)
flatten_strides = torch.cat(flatten_strides)
pos_strided_bbox_preds = flatten_strided_bbox_preds[pos_inds]
pos_strided_bbox2d_preds = flatten_strided_bbox2d_preds[pos_inds]
pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds]
pos_strides = flatten_strides[pos_inds]
pos_decoded_bbox2d_preds = distance2bbox(pos_points,
pos_strided_bbox2d_preds)
pos_strided_bbox_preds[:, :2] = \
pos_points - pos_strided_bbox_preds[:, :2]
pos_bbox_targets_3d[:, :2] = \
pos_points - pos_bbox_targets_3d[:, :2]
if self.use_depth_classifier and (not self.use_onlyreg_proj):
pos_prob_depth_preds = self.bbox_coder.decode_prob_depth(
pos_depth_cls_preds, self.depth_range, self.depth_unit,
self.division, self.num_depth_cls)
sig_alpha = torch.sigmoid(self.fuse_lambda)
pos_strided_bbox_preds[:, 2] = \
sig_alpha * pos_strided_bbox_preds.clone()[:, 2] + \
(1 - sig_alpha) * pos_prob_depth_preds
box_corners_in_image = pos_strided_bbox_preds.new_zeros(
(*pos_strided_bbox_preds.shape[:-1], 8, 2))
box_corners_in_image_gt = pos_strided_bbox_preds.new_zeros(
(*pos_strided_bbox_preds.shape[:-1], 8, 2))
for idx in range(num_imgs):
mask = (pos_img_idx == idx)
if pos_strided_bbox_preds[mask].shape[0] == 0:
continue
cam2img = torch.eye(
4,
dtype=pos_strided_bbox_preds.dtype,
device=pos_strided_bbox_preds.device)
view_shape = views[idx].shape
cam2img[:view_shape[0], :view_shape[1]] = \
pos_strided_bbox_preds.new_tensor(views[idx])
centers2d_preds = pos_strided_bbox_preds.clone()[mask, :2]
centers2d_targets = pos_bbox_targets_3d.clone()[mask, :2]
centers3d_targets = points_img2cam(pos_bbox_targets_3d[mask, :3],
views[idx])
# use predicted depth to re-project the 2.5D centers
pos_strided_bbox_preds[mask, :3] = points_img2cam(
pos_strided_bbox_preds[mask, :3], views[idx])
pos_bbox_targets_3d[mask, :3] = centers3d_targets
# depth fixed when computing re-project 3D bboxes
pos_strided_bbox_preds[mask, 2] = \
pos_bbox_targets_3d.clone()[mask, 2]
# decode yaws
if self.use_direction_classifier:
pos_dir_cls_scores = torch.max(
pos_dir_cls_preds[mask], dim=-1)[1]
pos_strided_bbox_preds[mask] = self.bbox_coder.decode_yaw(
pos_strided_bbox_preds[mask], centers2d_preds,
pos_dir_cls_scores, self.dir_offset, cam2img)
pos_bbox_targets_3d[mask, 6] = torch.atan2(
centers2d_targets[:, 0] - cam2img[0, 2],
cam2img[0, 0]) + pos_bbox_targets_3d[mask, 6]
corners = img_metas[0]['box_type_3d'](
pos_strided_bbox_preds[mask],
box_dim=self.bbox_coder.bbox_code_size,
origin=(0.5, 0.5, 0.5)).corners
box_corners_in_image[mask] = points_cam2img(corners, cam2img)
corners_gt = img_metas[0]['box_type_3d'](
pos_bbox_targets_3d[mask, :self.bbox_code_size],
box_dim=self.bbox_coder.bbox_code_size,
origin=(0.5, 0.5, 0.5)).corners
box_corners_in_image_gt[mask] = points_cam2img(corners_gt, cam2img)
minxy = torch.min(box_corners_in_image, dim=1)[0]
maxxy = torch.max(box_corners_in_image, dim=1)[0]
proj_bbox2d_preds = torch.cat([minxy, maxxy], dim=1)
outputs = (proj_bbox2d_preds, pos_decoded_bbox2d_preds)
if with_kpts:
norm_strides = pos_strides * self.regress_ranges[0][1] / \
self.strides[0]
kpts_targets = box_corners_in_image_gt - pos_points[..., None, :]
kpts_targets = kpts_targets.view(
(*pos_strided_bbox_preds.shape[:-1], 16))
kpts_targets /= norm_strides
outputs += (kpts_targets, )
return outputs
def get_pos_predictions(self, bbox_preds, dir_cls_preds, depth_cls_preds,
weights, attr_preds, centernesses, pos_inds,
img_metas):
"""Flatten predictions and get positive ones.
Args:
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
depth_cls_preds (list[Tensor]): Box scores for depth class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * self.num_depth_cls.
attr_preds (list[Tensor]): Attribute scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_attrs.
centernesses (list[Tensor]): Centerness for each scale level, each
is a 4D-tensor, the channel number is num_points * 1.
pos_inds (Tensor): Index of foreground points from flattened
tensors.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
tuple[Tensor]: Box predictions, direction classes, probabilistic
depth maps, location-aware weight maps, attributes and
centerness predictions.
"""
flatten_bbox_preds = [
bbox_pred.permute(0, 2, 3, 1).reshape(-1, sum(self.group_reg_dims))
for bbox_pred in bbox_preds
]
flatten_dir_cls_preds = [
dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2)
for dir_cls_pred in dir_cls_preds
]
flatten_centerness = [
centerness.permute(0, 2, 3, 1).reshape(-1)
for centerness in centernesses
]
flatten_bbox_preds = torch.cat(flatten_bbox_preds)
flatten_dir_cls_preds = torch.cat(flatten_dir_cls_preds)
flatten_centerness = torch.cat(flatten_centerness)
pos_bbox_preds = flatten_bbox_preds[pos_inds]
pos_dir_cls_preds = flatten_dir_cls_preds[pos_inds]
pos_centerness = flatten_centerness[pos_inds]
pos_depth_cls_preds = None
if self.use_depth_classifier:
flatten_depth_cls_preds = [
depth_cls_pred.permute(0, 2, 3,
1).reshape(-1, self.num_depth_cls)
for depth_cls_pred in depth_cls_preds
]
flatten_depth_cls_preds = torch.cat(flatten_depth_cls_preds)
pos_depth_cls_preds = flatten_depth_cls_preds[pos_inds]
pos_weights = None
if self.weight_dim != -1:
flatten_weights = [
weight.permute(0, 2, 3, 1).reshape(-1, self.weight_dim)
for weight in weights
]
flatten_weights = torch.cat(flatten_weights)
pos_weights = flatten_weights[pos_inds]
pos_attr_preds = None
if self.pred_attrs:
flatten_attr_preds = [
attr_pred.permute(0, 2, 3, 1).reshape(-1, self.num_attrs)
for attr_pred in attr_preds
]
flatten_attr_preds = torch.cat(flatten_attr_preds)
pos_attr_preds = flatten_attr_preds[pos_inds]
return pos_bbox_preds, pos_dir_cls_preds, pos_depth_cls_preds, \
pos_weights, pos_attr_preds, pos_centerness
@force_fp32(
apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds',
'depth_cls_preds', 'weights', 'attr_preds', 'centernesses'))
def loss(self,
cls_scores,
bbox_preds,
dir_cls_preds,
depth_cls_preds,
weights,
attr_preds,
centernesses,
gt_bboxes,
gt_labels,
gt_bboxes_3d,
gt_labels_3d,
centers2d,
depths,
attr_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
depth_cls_preds (list[Tensor]): Box scores for depth class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * self.num_depth_cls.
weights (list[Tensor]): Location-aware weights for each scale
level, each is a 4D-tensor, the channel number is
num_points * self.weight_dim.
attr_preds (list[Tensor]): Attribute scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_attrs.
centernesses (list[Tensor]): Centerness for each scale level, each
is a 4D-tensor, the channel number is num_points * 1.
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
gt_bboxes_3d (list[Tensor]): 3D boxes ground truth with shape of
(num_gts, code_size).
gt_labels_3d (list[Tensor]): same as gt_labels
centers2d (list[Tensor]): 2D centers on the image with shape of
(num_gts, 2).
depths (list[Tensor]): Depth ground truth with shape of
(num_gts, ).
attr_labels (list[Tensor]): Attributes indices of each box.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (list[Tensor]): specify which bounding boxes can
be ignored when computing the loss. Defaults to None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \
len(depth_cls_preds) == len(weights) == len(centernesses) == \
len(attr_preds), 'The length of cls_scores, bbox_preds, ' \
'dir_cls_preds, depth_cls_preds, weights, centernesses, and ' \
f'attr_preds: {len(cls_scores)}, {len(bbox_preds)}, ' \
f'{len(dir_cls_preds)}, {len(depth_cls_preds)}, {len(weights)}, ' \
f'{len(centernesses)}, {len(attr_preds)} are inconsistent.'
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
bbox_preds[0].device)
labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \
self.get_targets(
all_level_points, gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels)
num_imgs = cls_scores[0].size(0)
# flatten cls_scores and targets
flatten_cls_scores = [
cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
for cls_score in cls_scores
]
flatten_cls_scores = torch.cat(flatten_cls_scores)
flatten_labels_3d = torch.cat(labels_3d)
flatten_bbox_targets_3d = torch.cat(bbox_targets_3d)
flatten_centerness_targets = torch.cat(centerness_targets)
flatten_points = torch.cat(
[points.repeat(num_imgs, 1) for points in all_level_points])
if self.pred_attrs:
flatten_attr_targets = torch.cat(attr_targets)
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
bg_class_ind = self.num_classes
pos_inds = ((flatten_labels_3d >= 0)
& (flatten_labels_3d < bg_class_ind)).nonzero().reshape(-1)
num_pos = len(pos_inds)
loss_dict = dict()
loss_dict['loss_cls'] = self.loss_cls(
flatten_cls_scores,
flatten_labels_3d,
avg_factor=num_pos + num_imgs) # avoid num_pos is 0
pos_bbox_preds, pos_dir_cls_preds, pos_depth_cls_preds, pos_weights, \
pos_attr_preds, pos_centerness = self.get_pos_predictions(
bbox_preds, dir_cls_preds, depth_cls_preds, weights,
attr_preds, centernesses, pos_inds, img_metas)
if num_pos > 0:
pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds]
pos_centerness_targets = flatten_centerness_targets[pos_inds]
pos_points = flatten_points[pos_inds]
if self.pred_attrs:
pos_attr_targets = flatten_attr_targets[pos_inds]
if self.use_direction_classifier:
pos_dir_cls_targets = self.get_direction_target(
pos_bbox_targets_3d, self.dir_offset, one_hot=False)
bbox_weights = pos_centerness_targets.new_ones(
len(pos_centerness_targets), sum(self.group_reg_dims))
equal_weights = pos_centerness_targets.new_ones(
pos_centerness_targets.shape)
code_weight = self.train_cfg.get('code_weight', None)
if code_weight:
assert len(code_weight) == sum(self.group_reg_dims)
bbox_weights = bbox_weights * bbox_weights.new_tensor(
code_weight)
if self.diff_rad_by_sin:
pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference(
pos_bbox_preds, pos_bbox_targets_3d)
loss_dict['loss_offset'] = self.loss_bbox(
pos_bbox_preds[:, :2],
pos_bbox_targets_3d[:, :2],
weight=bbox_weights[:, :2],
avg_factor=equal_weights.sum())
loss_dict['loss_size'] = self.loss_bbox(
pos_bbox_preds[:, 3:6],
pos_bbox_targets_3d[:, 3:6],
weight=bbox_weights[:, 3:6],
avg_factor=equal_weights.sum())
loss_dict['loss_rotsin'] = self.loss_bbox(
pos_bbox_preds[:, 6],
pos_bbox_targets_3d[:, 6],
weight=bbox_weights[:, 6],
avg_factor=equal_weights.sum())
if self.pred_velo:
loss_dict['loss_velo'] = self.loss_bbox(
pos_bbox_preds[:, 7:9],
pos_bbox_targets_3d[:, 7:9],
weight=bbox_weights[:, 7:9],
avg_factor=equal_weights.sum())
proj_bbox2d_inputs = (bbox_preds, pos_dir_cls_preds, labels_3d,
bbox_targets_3d, pos_points, pos_inds,
img_metas)
# direction classification loss
# TODO: add more check for use_direction_classifier
if self.use_direction_classifier:
loss_dict['loss_dir'] = self.loss_dir(
pos_dir_cls_preds,
pos_dir_cls_targets,
equal_weights,
avg_factor=equal_weights.sum())
# init depth loss with the one computed from direct regression
loss_dict['loss_depth'] = self.loss_bbox(
pos_bbox_preds[:, 2],
pos_bbox_targets_3d[:, 2],
weight=bbox_weights[:, 2],
avg_factor=equal_weights.sum())
# depth classification loss
if self.use_depth_classifier:
pos_prob_depth_preds = self.bbox_coder.decode_prob_depth(
pos_depth_cls_preds, self.depth_range, self.depth_unit,
self.division, self.num_depth_cls)
sig_alpha = torch.sigmoid(self.fuse_lambda)
if self.weight_dim != -1:
loss_fuse_depth = self.loss_depth(
sig_alpha * pos_bbox_preds[:, 2] +
(1 - sig_alpha) * pos_prob_depth_preds,
pos_bbox_targets_3d[:, 2],
sigma=pos_weights[:, 0],
weight=bbox_weights[:, 2],
avg_factor=equal_weights.sum())
else:
loss_fuse_depth = self.loss_depth(
sig_alpha * pos_bbox_preds[:, 2] +
(1 - sig_alpha) * pos_prob_depth_preds,
pos_bbox_targets_3d[:, 2],
weight=bbox_weights[:, 2],
avg_factor=equal_weights.sum())
loss_dict['loss_depth'] = loss_fuse_depth
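# Note (assumption): when weight_dim != -1 the extra `sigma` argument is
# the location-aware weight interpreted as a predicted depth uncertainty,
# so `loss_depth` is expected to be an uncertainty-aware regression loss
# that accepts `sigma`; otherwise the fused depth is supervised with a
# plain regression loss.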
proj_bbox2d_inputs += (pos_depth_cls_preds, )
if self.pred_keypoints:
# use smoothL1 to compute consistency loss for keypoints
# normalize the offsets with strides
proj_bbox2d_preds, pos_decoded_bbox2d_preds, kpts_targets = \
self.get_proj_bbox2d(*proj_bbox2d_inputs, with_kpts=True)
loss_dict['loss_kpts'] = self.loss_bbox(
pos_bbox_preds[:, self.kpts_start:self.kpts_start + 16],
kpts_targets,
weight=bbox_weights[:,
self.kpts_start:self.kpts_start + 16],
avg_factor=equal_weights.sum())
if self.pred_bbox2d:
loss_dict['loss_bbox2d'] = self.loss_bbox2d(
pos_bbox_preds[:, -4:],
pos_bbox_targets_3d[:, -4:],
weight=bbox_weights[:, -4:],
avg_factor=equal_weights.sum())
if not self.pred_keypoints:
proj_bbox2d_preds, pos_decoded_bbox2d_preds = \
self.get_proj_bbox2d(*proj_bbox2d_inputs)
loss_dict['loss_consistency'] = self.loss_consistency(
proj_bbox2d_preds,
pos_decoded_bbox2d_preds,
weight=bbox_weights[:, -4:],
avg_factor=equal_weights.sum())
loss_dict['loss_centerness'] = self.loss_centerness(
pos_centerness, pos_centerness_targets)
# attribute classification loss
if self.pred_attrs:
loss_dict['loss_attr'] = self.loss_attr(
pos_attr_preds,
pos_attr_targets,
pos_centerness_targets,
avg_factor=pos_centerness_targets.sum())
else:
# no positive samples: use (empty) sums over positive predictions so
# every loss term stays connected to the computation graph
loss_dict['loss_offset'] = pos_bbox_preds[:, :2].sum()
loss_dict['loss_size'] = pos_bbox_preds[:, 3:6].sum()
loss_dict['loss_rotsin'] = pos_bbox_preds[:, 6].sum()
loss_dict['loss_depth'] = pos_bbox_preds[:, 2].sum()
if self.pred_velo:
loss_dict['loss_velo'] = pos_bbox_preds[:, 7:9].sum()
if self.pred_keypoints:
loss_dict['loss_kpts'] = pos_bbox_preds[:,
self.kpts_start:self.
kpts_start + 16].sum()
if self.pred_bbox2d:
loss_dict['loss_bbox2d'] = pos_bbox_preds[:, -4:].sum()
loss_dict['loss_consistency'] = pos_bbox_preds[:, -4:].sum()
loss_dict['loss_centerness'] = pos_centerness.sum()
if self.use_direction_classifier:
loss_dict['loss_dir'] = pos_dir_cls_preds.sum()
if self.use_depth_classifier:
sig_alpha = torch.sigmoid(self.fuse_lambda)
loss_fuse_depth = \
sig_alpha * pos_bbox_preds[:, 2].sum() + \
(1 - sig_alpha) * pos_depth_cls_preds.sum()
if self.weight_dim != -1:
loss_fuse_depth *= torch.exp(-pos_weights[:, 0].sum())
loss_dict['loss_depth'] = loss_fuse_depth
if self.pred_attrs:
loss_dict['loss_attr'] = pos_attr_preds.sum()
return loss_dict
@force_fp32(
apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds',
'depth_cls_preds', 'weights', 'attr_preds', 'centernesses'))
def get_bboxes(self,
cls_scores,
bbox_preds,
dir_cls_preds,
depth_cls_preds,
weights,
attr_preds,
centernesses,
img_metas,
cfg=None,
rescale=None):
"""Transform network output for a batch into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_points * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_points * 4, H, W)
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
depth_cls_preds (list[Tensor]): Box scores for depth class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * self.num_depth_cls.
weights (list[Tensor]): Location-aware weights for each scale
level, each is a 4D-tensor, the channel number is
num_points * self.weight_dim.
attr_preds (list[Tensor]): Attribute scores for each scale level
Has shape (N, num_points * num_attrs, H, W)
centernesses (list[Tensor]): Centerness for each scale level with
shape (N, num_points * 1, H, W)
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config, optional): Test / postprocessing configuration,
if None, test_cfg would be used. Defaults to None.
rescale (bool, optional): If True, return boxes in original image
space. Defaults to None.
Returns:
list[tuple[Tensor]]: Each item in result_list is a tuple, which
consists of predicted 3D boxes, scores, labels, attributes and
2D boxes (if necessary).
"""
assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \
len(depth_cls_preds) == len(weights) == len(centernesses) == \
len(attr_preds), 'The length of cls_scores, bbox_preds, ' \
'dir_cls_preds, depth_cls_preds, weights, centernesses, and ' \
f'attr_preds: {len(cls_scores)}, {len(bbox_preds)}, ' \
f'{len(dir_cls_preds)}, {len(depth_cls_preds)}, {len(weights)}, ' \
f'{len(centernesses)}, {len(attr_preds)} are inconsistent.'
num_levels = len(cls_scores)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
bbox_preds[0].device)
result_list = []
for img_id in range(len(img_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds[i][img_id].detach() for i in range(num_levels)
]
if self.use_direction_classifier:
dir_cls_pred_list = [
dir_cls_preds[i][img_id].detach()
for i in range(num_levels)
]
else:
dir_cls_pred_list = [
cls_scores[i][img_id].new_full(
[2, *cls_scores[i][img_id].shape[1:]], 0).detach()
for i in range(num_levels)
]
if self.use_depth_classifier:
depth_cls_pred_list = [
depth_cls_preds[i][img_id].detach()
for i in range(num_levels)
]
else:
depth_cls_pred_list = [
cls_scores[i][img_id].new_full(
[self.num_depth_cls, *cls_scores[i][img_id].shape[1:]],
0).detach() for i in range(num_levels)
]
if self.weight_dim != -1:
weight_list = [
weights[i][img_id].detach() for i in range(num_levels)
]
else:
weight_list = [
cls_scores[i][img_id].new_full(
[1, *cls_scores[i][img_id].shape[1:]], 0).detach()
for i in range(num_levels)
]
if self.pred_attrs:
attr_pred_list = [
attr_preds[i][img_id].detach() for i in range(num_levels)
]
else:
attr_pred_list = [
cls_scores[i][img_id].new_full(
[self.num_attrs, *cls_scores[i][img_id].shape[1:]],
self.attr_background_label).detach()
for i in range(num_levels)
]
centerness_pred_list = [
centernesses[i][img_id].detach() for i in range(num_levels)
]
input_meta = img_metas[img_id]
det_bboxes = self._get_bboxes_single(
cls_score_list, bbox_pred_list, dir_cls_pred_list,
depth_cls_pred_list, weight_list, attr_pred_list,
centerness_pred_list, mlvl_points, input_meta, cfg, rescale)
result_list.append(det_bboxes)
return result_list
def _get_bboxes_single(self,
cls_scores,
bbox_preds,
dir_cls_preds,
depth_cls_preds,
weights,
attr_preds,
centernesses,
mlvl_points,
input_meta,
cfg,
rescale=False):
"""Transform outputs for a single batch item into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for a single scale level
Has shape (num_points * num_classes, H, W).
bbox_preds (list[Tensor]): Box energies / deltas for a single scale
level with shape (num_points * bbox_code_size, H, W).
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on a single scale level with shape
(num_points * 2, H, W)
depth_cls_preds (list[Tensor]): Box scores for probabilistic depth
predictions on a single scale level with shape
(num_points * self.num_depth_cls, H, W)
weights (list[Tensor]): Location-aware weight maps on a single
scale level with shape (num_points * self.weight_dim, H, W).
attr_preds (list[Tensor]): Attribute scores for each scale level
Has shape (N, num_points * num_attrs, H, W)
centernesses (list[Tensor]): Centerness for a single scale level
with shape (num_points, H, W).
mlvl_points (list[Tensor]): Box reference for a single scale level
with shape (num_total_points, 2).
input_meta (dict): Metadata of input image.
cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used.
rescale (bool, optional): If True, return boxes in original image
space. Defaults to False.
Returns:
tuples[Tensor]: Predicted 3D boxes, scores, labels, attributes and
2D boxes (if necessary).
"""
view = np.array(input_meta['cam2img'])
scale_factor = input_meta['scale_factor']
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
mlvl_centers2d = []
mlvl_bboxes = []
mlvl_scores = []
mlvl_dir_scores = []
mlvl_attr_scores = []
mlvl_centerness = []
mlvl_depth_cls_scores = []
mlvl_depth_uncertainty = []
mlvl_bboxes2d = None
if self.pred_bbox2d:
mlvl_bboxes2d = []
for cls_score, bbox_pred, dir_cls_pred, depth_cls_pred, weight, \
attr_pred, centerness, points in zip(
cls_scores, bbox_preds, dir_cls_preds, depth_cls_preds,
weights, attr_preds, centernesses, mlvl_points):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
scores = cls_score.permute(1, 2, 0).reshape(
-1, self.cls_out_channels).sigmoid()
dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
depth_cls_pred = depth_cls_pred.permute(1, 2, 0).reshape(
-1, self.num_depth_cls)
depth_cls_score = F.softmax(
depth_cls_pred, dim=-1).topk(
k=2, dim=-1)[0].mean(dim=-1)
if self.weight_dim != -1:
weight = weight.permute(1, 2, 0).reshape(-1, self.weight_dim)
else:
weight = weight.permute(1, 2, 0).reshape(-1, 1)
depth_uncertainty = torch.exp(-weight[:, -1])
attr_pred = attr_pred.permute(1, 2, 0).reshape(-1, self.num_attrs)
attr_score = torch.max(attr_pred, dim=-1)[1]
centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid()
bbox_pred = bbox_pred.permute(1, 2,
0).reshape(-1,
sum(self.group_reg_dims))
bbox_pred3d = bbox_pred[:, :self.bbox_coder.bbox_code_size]
if self.pred_bbox2d:
bbox_pred2d = bbox_pred[:, -4:]
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
merged_scores = scores * centerness[:, None]
if self.use_depth_classifier:
merged_scores *= depth_cls_score[:, None]
if self.weight_dim != -1:
merged_scores *= depth_uncertainty[:, None]
max_scores, _ = merged_scores.max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
points = points[topk_inds, :]
bbox_pred3d = bbox_pred3d[topk_inds, :]
scores = scores[topk_inds, :]
dir_cls_pred = dir_cls_pred[topk_inds, :]
depth_cls_pred = depth_cls_pred[topk_inds, :]
centerness = centerness[topk_inds]
dir_cls_score = dir_cls_score[topk_inds]
depth_cls_score = depth_cls_score[topk_inds]
depth_uncertainty = depth_uncertainty[topk_inds]
attr_score = attr_score[topk_inds]
if self.pred_bbox2d:
bbox_pred2d = bbox_pred2d[topk_inds, :]
# change the offset to actual center predictions
bbox_pred3d[:, :2] = points - bbox_pred3d[:, :2]
if rescale:
bbox_pred3d[:, :2] /= bbox_pred3d[:, :2].new_tensor(
scale_factor)
if self.pred_bbox2d:
bbox_pred2d /= bbox_pred2d.new_tensor(scale_factor)
if self.use_depth_classifier:
prob_depth_pred = self.bbox_coder.decode_prob_depth(
depth_cls_pred, self.depth_range, self.depth_unit,
self.division, self.num_depth_cls)
sig_alpha = torch.sigmoid(self.fuse_lambda)
bbox_pred3d[:, 2] = sig_alpha * bbox_pred3d[:, 2] + \
(1 - sig_alpha) * prob_depth_pred
pred_center2d = bbox_pred3d[:, :3].clone()
bbox_pred3d[:, :3] = points_img2cam(bbox_pred3d[:, :3], view)
mlvl_centers2d.append(pred_center2d)
mlvl_bboxes.append(bbox_pred3d)
mlvl_scores.append(scores)
mlvl_dir_scores.append(dir_cls_score)
mlvl_depth_cls_scores.append(depth_cls_score)
mlvl_attr_scores.append(attr_score)
mlvl_centerness.append(centerness)
mlvl_depth_uncertainty.append(depth_uncertainty)
if self.pred_bbox2d:
bbox_pred2d = distance2bbox(
points, bbox_pred2d, max_shape=input_meta['img_shape'])
mlvl_bboxes2d.append(bbox_pred2d)
mlvl_centers2d = torch.cat(mlvl_centers2d)
mlvl_bboxes = torch.cat(mlvl_bboxes)
mlvl_dir_scores = torch.cat(mlvl_dir_scores)
if self.pred_bbox2d:
mlvl_bboxes2d = torch.cat(mlvl_bboxes2d)
# change local yaw to global yaw for 3D nms
cam2img = torch.eye(
4, dtype=mlvl_centers2d.dtype, device=mlvl_centers2d.device)
cam2img[:view.shape[0], :view.shape[1]] = \
mlvl_centers2d.new_tensor(view)
mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers2d,
mlvl_dir_scores,
self.dir_offset, cam2img)
mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
mlvl_bboxes,
box_dim=self.bbox_coder.bbox_code_size,
origin=(0.5, 0.5, 0.5)).bev)
mlvl_scores = torch.cat(mlvl_scores)
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
# note that we set FG labels to [0, num_class-1] since mmdet v2.0
# BG cat_id: num_class
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
mlvl_attr_scores = torch.cat(mlvl_attr_scores)
mlvl_centerness = torch.cat(mlvl_centerness)
# no scale_factors in box3d_multiclass_nms
# Then we multiply it from outside
mlvl_nms_scores = mlvl_scores * mlvl_centerness[:, None]
if self.use_depth_classifier: # multiply the depth confidence
mlvl_depth_cls_scores = torch.cat(mlvl_depth_cls_scores)
mlvl_nms_scores *= mlvl_depth_cls_scores[:, None]
if self.weight_dim != -1:
mlvl_depth_uncertainty = torch.cat(mlvl_depth_uncertainty)
mlvl_nms_scores *= mlvl_depth_uncertainty[:, None]
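# Resulting ranking score (descriptive):
#   nms_score = cls_score * centerness
#               [* depth_cls_confidence] [* exp(-location_aware_weight)]
# where the bracketed factors apply only when the depth classifier or the
# location-aware weight branch is enabled.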
results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
mlvl_nms_scores, cfg.score_thr,
cfg.max_per_img, cfg, mlvl_dir_scores,
mlvl_attr_scores, mlvl_bboxes2d)
bboxes, scores, labels, dir_scores, attrs = results[0:5]
attrs = attrs.to(labels.dtype) # change data type to int
bboxes = input_meta['box_type_3d'](
bboxes,
box_dim=self.bbox_coder.bbox_code_size,
origin=(0.5, 0.5, 0.5))
# Note that the predictions use origin (0.5, 0.5, 0.5)
# because the ground-truth centers2d are the gravity centers of objects.
# v0.10.0 fixed the inplace operation on the input tensor of cam_box3d,
# so here we also need to set origin=(0.5, 0.5, 0.5).
if not self.pred_attrs:
attrs = None
outputs = (bboxes, scores, labels, attrs)
if self.pred_bbox2d:
bboxes2d = results[-1]
bboxes2d = torch.cat([bboxes2d, scores[:, None]], dim=1)
outputs = outputs + (bboxes2d, )
return outputs
def get_targets(self, points, gt_bboxes_list, gt_labels_list,
gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list,
depths_list, attr_labels_list):
"""Compute regression, classification and centerss targets for points
in multiple images.
Args:
points (list[Tensor]): Points of each fpn level, each has shape
(num_points, 2).
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
each has shape (num_gt, 4).
gt_labels_list (list[Tensor]): Ground truth labels of each box,
each has shape (num_gt,).
gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each
image, each has shape (num_gt, bbox_code_size).
gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each
box, each has shape (num_gt,).
centers2d_list (list[Tensor]): Projected 3D centers onto 2D image,
each has shape (num_gt, 2).
depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
image, each has shape (num_gt, 1).
attr_labels_list (list[Tensor]): Attribute labels of each box,
each has shape (num_gt,).
Returns:
tuple:
concat_lvl_labels_3d (list[Tensor]): 3D box category labels of each level.
concat_lvl_bbox_targets_3d (list[Tensor]): 3D box targets of each level.
concat_lvl_centerness_targets (list[Tensor]): Centerness targets of each level.
concat_lvl_attr_targets (list[Tensor]): Attribute targets of each level.
"""
assert len(points) == len(self.regress_ranges)
num_levels = len(points)
# expand regress ranges to align with points
expanded_regress_ranges = [
points[i].new_tensor(self.regress_ranges[i])[None].expand_as(
points[i]) for i in range(num_levels)
]
# concat all levels points and regress ranges
concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0)
concat_points = torch.cat(points, dim=0)
# the number of points per img, per lvl
num_points = [center.size(0) for center in points]
if attr_labels_list is None:
attr_labels_list = [
gt_labels.new_full(gt_labels.shape, self.attr_background_label)
for gt_labels in gt_labels_list
]
# get labels and bbox_targets of each image
_, bbox_targets_list, labels_3d_list, bbox_targets_3d_list, \
centerness_targets_list, attr_targets_list = multi_apply(
self._get_target_single,
gt_bboxes_list,
gt_labels_list,
gt_bboxes_3d_list,
gt_labels_3d_list,
centers2d_list,
depths_list,
attr_labels_list,
points=concat_points,
regress_ranges=concat_regress_ranges,
num_points_per_lvl=num_points)
# split to per img, per level
bbox_targets_list = [
bbox_targets.split(num_points, 0)
for bbox_targets in bbox_targets_list
]
labels_3d_list = [
labels_3d.split(num_points, 0) for labels_3d in labels_3d_list
]
bbox_targets_3d_list = [
bbox_targets_3d.split(num_points, 0)
for bbox_targets_3d in bbox_targets_3d_list
]
centerness_targets_list = [
centerness_targets.split(num_points, 0)
for centerness_targets in centerness_targets_list
]
attr_targets_list = [
attr_targets.split(num_points, 0)
for attr_targets in attr_targets_list
]
# concatenate targets of the same level across images
concat_lvl_labels_3d = []
concat_lvl_bbox_targets_3d = []
concat_lvl_centerness_targets = []
concat_lvl_attr_targets = []
for i in range(num_levels):
concat_lvl_labels_3d.append(
torch.cat([labels[i] for labels in labels_3d_list]))
concat_lvl_centerness_targets.append(
torch.cat([
centerness_targets[i]
for centerness_targets in centerness_targets_list
]))
bbox_targets_3d = torch.cat([
bbox_targets_3d[i] for bbox_targets_3d in bbox_targets_3d_list
])
if self.pred_bbox2d:
bbox_targets = torch.cat(
[bbox_targets[i] for bbox_targets in bbox_targets_list])
bbox_targets_3d = torch.cat([bbox_targets_3d, bbox_targets],
dim=1)
concat_lvl_attr_targets.append(
torch.cat(
[attr_targets[i] for attr_targets in attr_targets_list]))
if self.norm_on_bbox:
bbox_targets_3d[:, :2] = \
bbox_targets_3d[:, :2] / self.strides[i]
if self.pred_bbox2d:
bbox_targets_3d[:, -4:] = \
bbox_targets_3d[:, -4:] / self.strides[i]
concat_lvl_bbox_targets_3d.append(bbox_targets_3d)
return concat_lvl_labels_3d, concat_lvl_bbox_targets_3d, \
concat_lvl_centerness_targets, concat_lvl_attr_targets
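# --- Illustrative sketch (not part of the original file) --------------------
# The head above fuses the directly regressed depth with a probabilistic
# depth decoded from `depth_cls_preds` via `self.bbox_coder.decode_prob_depth`.
# The helper below shows one plausible decoding for the 'uniform' division
# case only; the function name and the expectation-over-bin-centers scheme
# are assumptions, not the actual PGDBBoxCoder implementation.
def uniform_prob_depth_sketch(depth_cls_preds,
                              depth_range=(0, 70),
                              depth_unit=10,
                              num_depth_cls=8):
    """Decode (N, num_depth_cls) depth logits into (N, ) expected depths."""
    # depth value represented by each discrete bin
    bin_centers = depth_cls_preds.new_tensor(
        [depth_range[0] + i * depth_unit for i in range(num_depth_cls)])
    probs = F.softmax(depth_cls_preds, dim=-1)
    # expectation of the discretized depths under the predicted distribution
    return (probs * bin_centers).sum(dim=-1)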
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmcv.runner import BaseModule, force_fp32
from torch import nn as nn
from mmdet3d.core.bbox.structures import (DepthInstance3DBoxes,
LiDARInstance3DBoxes)
from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu
from mmdet.core import build_bbox_coder, multi_apply
from mmdet.models import HEADS, build_loss
@HEADS.register_module()
class PointRPNHead(BaseModule):
"""RPN module for PointRCNN.
Args:
num_classes (int): Number of classes.
train_cfg (dict): Train configs.
test_cfg (dict): Test configs.
pred_layer_cfg (dict, optional): Config of classification and
regression prediction layers. Defaults to None.
enlarge_width (float, optional): Enlarge bbox for each side to ignore
close points. Defaults to 0.1.
cls_loss (dict, optional): Config of direction classification loss.
Defaults to None.
bbox_loss (dict, optional): Config of localization loss.
Defaults to None.
bbox_coder (dict, optional): Config dict of box coders.
Defaults to None.
init_cfg (dict, optional): Config of initialization. Defaults to None.
"""
def __init__(self,
num_classes,
train_cfg,
test_cfg,
pred_layer_cfg=None,
enlarge_width=0.1,
cls_loss=None,
bbox_loss=None,
bbox_coder=None,
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.num_classes = num_classes
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.enlarge_width = enlarge_width
# build loss function
self.bbox_loss = build_loss(bbox_loss)
self.cls_loss = build_loss(cls_loss)
# build box coder
self.bbox_coder = build_bbox_coder(bbox_coder)
# build pred conv
self.cls_layers = self._make_fc_layers(
fc_cfg=pred_layer_cfg.cls_linear_channels,
input_channels=pred_layer_cfg.in_channels,
output_channels=self._get_cls_out_channels())
self.reg_layers = self._make_fc_layers(
fc_cfg=pred_layer_cfg.reg_linear_channels,
input_channels=pred_layer_cfg.in_channels,
output_channels=self._get_reg_out_channels())
def _make_fc_layers(self, fc_cfg, input_channels, output_channels):
"""Make fully connect layers.
Args:
fc_cfg (dict): Config of fully connect.
input_channels (int): Input channels for fc_layers.
output_channels (int): Input channels for fc_layers.
Returns:
nn.Sequential: Fully connect layers.
"""
fc_layers = []
c_in = input_channels
for k in range(len(fc_cfg)):
fc_layers.extend([
nn.Linear(c_in, fc_cfg[k], bias=False),
nn.BatchNorm1d(fc_cfg[k]),
nn.ReLU(),
])
c_in = fc_cfg[k]
fc_layers.append(nn.Linear(c_in, output_channels, bias=True))
return nn.Sequential(*fc_layers)
def _get_cls_out_channels(self):
"""Return the channel number of classification outputs."""
# Class numbers (k)
return self.num_classes
def _get_reg_out_channels(self):
"""Return the channel number of regression outputs."""
# Bbox regression: center residual (3), size regression (3),
# torch.cos(yaw) (1), torch.sin(yaw) (1)
return self.bbox_coder.code_size
def forward(self, feat_dict):
"""Forward pass.
Args:
feat_dict (dict): Feature dict from backbone.
Returns:
tuple[list[torch.Tensor]]: Predicted boxes and classification
scores.
"""
point_features = feat_dict['fp_features']
point_features = point_features.permute(0, 2, 1).contiguous()
batch_size = point_features.shape[0]
feat_cls = point_features.view(-1, point_features.shape[-1])
feat_reg = point_features.view(-1, point_features.shape[-1])
point_cls_preds = self.cls_layers(feat_cls).reshape(
batch_size, -1, self._get_cls_out_channels())
point_box_preds = self.reg_layers(feat_reg).reshape(
batch_size, -1, self._get_reg_out_channels())
return (point_box_preds, point_cls_preds)
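# Shape sketch (descriptive): with `fp_features` of shape (B, C, N), the
# head permutes to (B, N, C), flattens to (B * N, C), applies the shared
# FC stacks, and reshapes back, giving point_cls_preds of shape
# (B, N, num_classes) and point_box_preds of shape (B, N, code_size).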
@force_fp32(apply_to=('bbox_preds', ))
def loss(self,
bbox_preds,
cls_preds,
points,
gt_bboxes_3d,
gt_labels_3d,
img_metas=None):
"""Compute loss.
Args:
bbox_preds (dict): Predictions from forward of PointRCNN RPN_Head.
cls_preds (dict): Classification from forward of PointRCNN
RPN_Head.
points (list[torch.Tensor]): Input points.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
bboxes of each sample.
gt_labels_3d (list[torch.Tensor]): Labels of each sample.
img_metas (list[dict], Optional): Contain pcd and img's meta info.
Defaults to None.
Returns:
dict: Losses of PointRCNN RPN module.
"""
targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d)
(bbox_targets, mask_targets, positive_mask, negative_mask,
box_loss_weights, point_targets) = targets
# bbox loss
bbox_loss = self.bbox_loss(bbox_preds, bbox_targets,
box_loss_weights.unsqueeze(-1))
# calculate semantic loss
semantic_points = cls_preds.reshape(-1, self.num_classes)
semantic_targets = mask_targets
semantic_targets[negative_mask] = self.num_classes
semantic_points_label = semantic_targets
# for ignore, but now we do not have ignore label
semantic_loss_weight = negative_mask.float() + positive_mask.float()
semantic_loss = self.cls_loss(semantic_points,
semantic_points_label.reshape(-1),
semantic_loss_weight.reshape(-1))
semantic_loss /= positive_mask.float().sum()
losses = dict(bbox_loss=bbox_loss, semantic_loss=semantic_loss)
return losses
def get_targets(self, points, gt_bboxes_3d, gt_labels_3d):
"""Generate targets of PointRCNN RPN head.
Args:
points (list[torch.Tensor]): Points of each batch.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): Labels of each batch.
Returns:
tuple[torch.Tensor]: Targets of PointRCNN RPN head.
"""
# find empty example
for index in range(len(gt_labels_3d)):
if len(gt_labels_3d[index]) == 0:
fake_box = gt_bboxes_3d[index].tensor.new_zeros(
1, gt_bboxes_3d[index].tensor.shape[-1])
gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box)
gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1)
(bbox_targets, mask_targets, positive_mask, negative_mask,
point_targets) = multi_apply(self.get_targets_single, points,
gt_bboxes_3d, gt_labels_3d)
bbox_targets = torch.stack(bbox_targets)
mask_targets = torch.stack(mask_targets)
positive_mask = torch.stack(positive_mask)
negative_mask = torch.stack(negative_mask)
box_loss_weights = positive_mask / (positive_mask.sum() + 1e-6)
return (bbox_targets, mask_targets, positive_mask, negative_mask,
box_loss_weights, point_targets)
def get_targets_single(self, points, gt_bboxes_3d, gt_labels_3d):
"""Generate targets of PointRCNN RPN head for single batch.
Args:
points (torch.Tensor): Points of each batch.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth
boxes of each batch.
gt_labels_3d (torch.Tensor): Labels of each batch.
Returns:
tuple[torch.Tensor]: Targets of PointRCNN RPN head.
"""
gt_bboxes_3d = gt_bboxes_3d.to(points.device)
valid_gt = gt_labels_3d != -1
gt_bboxes_3d = gt_bboxes_3d[valid_gt]
gt_labels_3d = gt_labels_3d[valid_gt]
# transform the bbox coordinate to the pointcloud coordinate
gt_bboxes_3d_tensor = gt_bboxes_3d.tensor.clone()
gt_bboxes_3d_tensor[..., 2] += gt_bboxes_3d_tensor[..., 5] / 2
points_mask, assignment = self._assign_targets_by_points_inside(
gt_bboxes_3d, points)
gt_bboxes_3d_tensor = gt_bboxes_3d_tensor[assignment]
mask_targets = gt_labels_3d[assignment]
bbox_targets = self.bbox_coder.encode(gt_bboxes_3d_tensor,
points[..., 0:3], mask_targets)
positive_mask = (points_mask.max(1)[0] > 0)
negative_mask = (points_mask.max(1)[0] == 0)
# add ignore_mask
extend_gt_bboxes_3d = gt_bboxes_3d.enlarged_box(self.enlarge_width)
points_mask, _ = self._assign_targets_by_points_inside(
extend_gt_bboxes_3d, points)
negative_mask = (points_mask.max(1)[0] == 0)
point_targets = points[..., 0:3]
return (bbox_targets, mask_targets, positive_mask, negative_mask,
point_targets)
def get_bboxes(self,
points,
bbox_preds,
cls_preds,
input_metas,
rescale=False):
"""Generate bboxes from RPN head predictions.
Args:
points (torch.Tensor): Input points.
bbox_preds (dict): Regression predictions from PointRCNN head.
cls_preds (dict): Class scores predictions from PointRCNN head.
input_metas (list[dict]): Point cloud and image's meta info.
rescale (bool, optional): Whether to rescale bboxes.
Defaults to False.
Returns:
list[tuple[torch.Tensor]]: Bounding boxes, scores and labels.
"""
sem_scores = cls_preds.sigmoid()
obj_scores = sem_scores.max(-1)[0]
object_class = sem_scores.argmax(dim=-1)
batch_size = sem_scores.shape[0]
results = list()
for b in range(batch_size):
bbox3d = self.bbox_coder.decode(bbox_preds[b], points[b, ..., :3],
object_class[b])
bbox_selected, score_selected, labels, cls_preds_selected = \
self.class_agnostic_nms(obj_scores[b], sem_scores[b], bbox3d,
points[b, ..., :3], input_metas[b])
bbox = input_metas[b]['box_type_3d'](
bbox_selected.clone(),
box_dim=bbox_selected.shape[-1],
with_yaw=True)
results.append((bbox, score_selected, labels, cls_preds_selected))
return results
def class_agnostic_nms(self, obj_scores, sem_scores, bbox, points,
input_meta):
"""Class agnostic nms.
Args:
obj_scores (torch.Tensor): Objectness score of bounding boxes.
sem_scores (torch.Tensor): Semantic class score of bounding boxes.
bbox (torch.Tensor): Predicted bounding boxes.
points (torch.Tensor): Input points.
input_meta (dict): Point cloud and image's meta info.
Returns:
tuple[torch.Tensor]: Bounding boxes, scores and labels.
"""
nms_cfg = self.test_cfg.nms_cfg if not self.training \
else self.train_cfg.nms_cfg
if nms_cfg.use_rotate_nms:
nms_func = nms_gpu
else:
nms_func = nms_normal_gpu
num_bbox = bbox.shape[0]
bbox = input_meta['box_type_3d'](
bbox.clone(),
box_dim=bbox.shape[-1],
with_yaw=True,
origin=(0.5, 0.5, 0.5))
if isinstance(bbox, LiDARInstance3DBoxes):
box_idx = bbox.points_in_boxes(points)
box_indices = box_idx.new_zeros([num_bbox + 1])
box_idx[box_idx == -1] = num_bbox
box_indices.scatter_add_(0, box_idx.long(),
box_idx.new_ones(box_idx.shape))
box_indices = box_indices[:-1]
nonempty_box_mask = box_indices >= 0
elif isinstance(bbox, DepthInstance3DBoxes):
box_indices = bbox.points_in_boxes(points)
nonempty_box_mask = box_indices.T.sum(1) >= 0
else:
raise NotImplementedError('Unsupported bbox type!')
bbox = bbox.tensor[nonempty_box_mask]
if self.test_cfg.score_thr is not None:
score_thr = self.test_cfg.score_thr
keep = (obj_scores >= score_thr)
obj_scores = obj_scores[keep]
sem_scores = sem_scores[keep]
bbox = bbox[keep]
if obj_scores.shape[0] > 0:
topk = min(nms_cfg.nms_pre, obj_scores.shape[0])
obj_scores_nms, indices = torch.topk(obj_scores, k=topk)
bbox_for_nms = bbox[indices]
sem_scores_nms = sem_scores[indices]
keep = nms_func(bbox_for_nms[:, 0:7], obj_scores_nms,
nms_cfg.iou_thr)
keep = keep[:nms_cfg.nms_post]
bbox_selected = bbox_for_nms[keep]
score_selected = obj_scores_nms[keep]
cls_preds = sem_scores_nms[keep]
labels = torch.argmax(cls_preds, -1)
return bbox_selected, score_selected, labels, cls_preds
def _assign_targets_by_points_inside(self, bboxes_3d, points):
"""Compute assignment by checking whether point is inside bbox.
Args:
bboxes_3d (:obj:`BaseInstance3DBoxes`): Instance of bounding boxes.
points (torch.Tensor): Points of a batch.
Returns:
tuple[torch.Tensor]: Flags indicating whether each point is
inside bbox and the index of box where each point are in.
"""
# TODO: align points_in_boxes function in each box_structures
num_bbox = bboxes_3d.tensor.shape[0]
if isinstance(bboxes_3d, LiDARInstance3DBoxes):
assignment = bboxes_3d.points_in_boxes(points[:, 0:3]).long()
points_mask = assignment.new_zeros(
[assignment.shape[0], num_bbox + 1])
assignment[assignment == -1] = num_bbox
points_mask.scatter_(1, assignment.unsqueeze(1), 1)
points_mask = points_mask[:, :-1]
assignment[assignment == num_bbox] = num_bbox - 1
elif isinstance(bboxes_3d, DepthInstance3DBoxes):
points_mask = bboxes_3d.points_in_boxes(points)
assignment = points_mask.argmax(dim=-1)
else:
raise NotImplementedError('Unsupported bbox type!')
return points_mask, assignment
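# --- Illustrative sketch (not part of the original file) --------------------
# The LiDAR branch of `_assign_targets_by_points_inside` above turns the
# per-point box index returned by `points_in_boxes` (-1 for background) into
# a one-hot (num_points, num_bbox) mask, using an extra dummy column to
# absorb the background points. A standalone toy version, assuming the same
# -1-for-outside convention:
def points_mask_sketch(assignment, num_bbox):
    """assignment: (N, ) long tensor of box indices, -1 for background."""
    assignment = assignment.clone()
    points_mask = assignment.new_zeros([assignment.shape[0], num_bbox + 1])
    assignment[assignment == -1] = num_bbox  # route background to dummy col
    points_mask.scatter_(1, assignment.unsqueeze(1), 1)
    return points_mask[:, :-1]  # drop the dummy background column
# Toy check: assignment = tensor([0, 2, -1, 1]) with num_bbox = 3 yields
#   [[1, 0, 0], [0, 0, 1], [0, 0, 0], [0, 1, 0]]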
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
import warnings
import numpy as np import numpy as np
import torch import torch
import warnings
from mmcv.cnn import ConvModule from mmcv.cnn import ConvModule
from mmcv.runner import BaseModule from mmcv.runner import BaseModule
from torch import nn as nn from torch import nn as nn
...@@ -30,15 +31,17 @@ class BaseShapeHead(BaseModule):
num_base_anchors (int): Number of anchors per location.
box_code_size (int): The dimension of boxes to be encoded.
in_channels (int): Input channels for convolutional layers.
shared_conv_channels (tuple, optional): Channels for shared
convolutional layers. Default: (64, 64).
shared_conv_strides (tuple, optional): Strides for shared
convolutional layers. Default: (1, 1).
use_direction_classifier (bool, optional): Whether to use direction
classifier. Default: True.
conv_cfg (dict, optional): Config of conv layer.
Default: dict(type='Conv2d')
norm_cfg (dict, optional): Config of norm layer.
Default: dict(type='BN2d').
bias (bool | str, optional): Type of bias. Default: False.
"""
def __init__(self,
...@@ -127,11 +130,11 @@ class BaseShapeHead(BaseModule):
[B, C, H, W].
Returns:
dict[torch.Tensor]: Contain score of each class, bbox
regression and direction classification predictions.
Note that all the returned tensors are reshaped as
[bs*num_base_anchors*H*W, num_cls/box_code_size/dir_bins].
It is more convenient to concat anchors for different
classes even though they have different feature map sizes.
"""
x = self.shared_conv(x)
...@@ -168,9 +171,9 @@ class ShapeAwareHead(Anchor3DHead):
Args:
tasks (dict): Shape-aware groups of multi-class objects.
assign_per_class (bool, optional): Whether to do assignment for each
class. Default: True.
kwargs (dict): Other arguments are the same as those in
:class:`Anchor3DHead`.
"""
...@@ -217,7 +220,7 @@ class ShapeAwareHead(Anchor3DHead):
Args:
x (torch.Tensor): Input features.
Returns:
tuple[torch.Tensor]: Contain score of each class, bbox
regression and direction classification predictions.
"""
results = []
...@@ -263,7 +266,7 @@ class ShapeAwareHead(Anchor3DHead):
num_total_samples (int): The number of valid samples.
Returns:
tuple[torch.Tensor]: Losses of class, bbox
and direction, respectively.
"""
# classification loss
...@@ -325,16 +328,16 @@ class ShapeAwareHead(Anchor3DHead):
of each sample.
gt_labels (list[torch.Tensor]): Gt labels of each sample.
input_metas (list[dict]): Contain pcd and img's meta info.
gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding
boxes can be ignored when computing the loss.
Returns:
dict[str, list[torch.Tensor]]: Classification, bbox, and
direction losses of each level.
- loss_cls (list[torch.Tensor]): Classification losses.
- loss_bbox (list[torch.Tensor]): Box regression losses.
- loss_dir (list[torch.Tensor]): Direction classification
losses.
"""
device = cls_scores[0].device
...@@ -388,7 +391,7 @@ class ShapeAwareHead(Anchor3DHead):
dir_cls_preds (list[torch.Tensor]): Multi-level direction
class predictions.
input_metas (list[dict]): Contain pcd and img's meta info.
cfg (:obj:`ConfigDict`, optional): Training or testing config.
Default: None.
rescale (list[torch.Tensor], optional): Whether to rescale bbox.
Default: False.
...@@ -443,8 +446,8 @@ class ShapeAwareHead(Anchor3DHead):
mlvl_anchors (List[torch.Tensor]): Multi-level anchors
in single batch.
input_meta (list[dict]): Contain pcd and img's meta info.
cfg (:obj:`ConfigDict`): Training or testing config.
rescale (list[torch.Tensor], optional): whether to rescale bbox.
Default: False.
Returns:
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch.nn import functional as F
from mmdet.core import multi_apply
from mmdet.core.bbox.builder import build_bbox_coder
from mmdet.models.builder import HEADS
from mmdet.models.utils import gaussian_radius, gen_gaussian_target
from mmdet.models.utils.gaussian_target import (get_local_maximum,
get_topk_from_heatmap,
transpose_and_gather_feat)
from .anchor_free_mono3d_head import AnchorFreeMono3DHead
@HEADS.register_module()
class SMOKEMono3DHead(AnchorFreeMono3DHead):
r"""Anchor-free head used in `SMOKE <https://arxiv.org/abs/2002.10111>`_
.. code-block:: none
/-----> 3*3 conv -----> 1*1 conv -----> cls
feature
\-----> 3*3 conv -----> 1*1 conv -----> reg
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
dim_channel (list[int]): indices of dimension offset preds in
regression heatmap channels.
ori_channel (list[int]): indices of orientation offset pred in
regression heatmap channels.
bbox_coder (:obj:`CameraInstance3DBoxes`): Bbox coder
for encoding and decoding boxes.
loss_cls (dict, optional): Config of classification loss.
Default: loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0).
loss_bbox (dict, optional): Config of localization loss.
Default: loss_bbox=dict(type='L1Loss', loss_weight=0.1).
loss_dir (dict, optional): Config of direction classification loss.
In SMOKE, Default: None.
loss_attr (dict, optional): Config of attribute classification loss.
In SMOKE, Default: None.
loss_centerness (dict): Config of centerness loss.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
init_cfg (dict): Initialization config dict. Default: None.
""" # noqa: E501
def __init__(self,
num_classes,
in_channels,
dim_channel,
ori_channel,
bbox_coder,
loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=0.1),
loss_dir=None,
loss_attr=None,
norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
init_cfg=None,
**kwargs):
super().__init__(
num_classes,
in_channels,
loss_cls=loss_cls,
loss_bbox=loss_bbox,
loss_dir=loss_dir,
loss_attr=loss_attr,
norm_cfg=norm_cfg,
init_cfg=init_cfg,
**kwargs)
self.dim_channel = dim_channel
self.ori_channel = ori_channel
self.bbox_coder = build_bbox_coder(bbox_coder)
def forward(self, feats):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
tuple:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
"""
return multi_apply(self.forward_single, feats)
def forward_single(self, x):
"""Forward features of a single scale level.
Args:
x (Tensor): Input feature map.
Returns:
tuple: Scores for each class, bbox of input feature maps.
"""
cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \
super().forward_single(x)
cls_score = cls_score.sigmoid() # turn to 0-1
cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4)
# (N, C, H, W)
offset_dims = bbox_pred[:, self.dim_channel, ...]
bbox_pred[:, self.dim_channel, ...] = offset_dims.sigmoid() - 0.5
# (N, C, H, W)
vector_ori = bbox_pred[:, self.ori_channel, ...]
bbox_pred[:, self.ori_channel, ...] = F.normalize(vector_ori)
return cls_score, bbox_pred
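As an aside, a self-contained sketch (dummy shapes and channel indices are assumptions, not from this diff) of the two channel-wise transforms applied in forward_single above:

import torch
import torch.nn.functional as F

bbox_pred = torch.randn(2, 8, 96, 320)                                  # (N, C, H, W) regression heatmap
dim_channel, ori_channel = [3, 4, 5], [6, 7]                            # hypothetical channel layout
bbox_pred[:, dim_channel] = bbox_pred[:, dim_channel].sigmoid() - 0.5   # bound dimension offsets to (-0.5, 0.5)
bbox_pred[:, ori_channel] = F.normalize(bbox_pred[:, ori_channel])      # unit-norm (sin, cos) pair over dim=1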
def get_bboxes(self, cls_scores, bbox_preds, img_metas, rescale=None):
"""Generate bboxes from bbox head predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level.
bbox_preds (list[Tensor]): Box regression for each scale.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
rescale (bool): If True, return boxes in original image space.
Returns:
list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]:
Each item in result_list is a 4-tuple.
"""
assert len(cls_scores) == len(bbox_preds) == 1
cam2imgs = torch.stack([
cls_scores[0].new_tensor(img_meta['cam2img'])
for img_meta in img_metas
])
trans_mats = torch.stack([
cls_scores[0].new_tensor(img_meta['trans_mat'])
for img_meta in img_metas
])
batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap(
cls_scores[0],
bbox_preds[0],
img_metas,
cam2imgs=cam2imgs,
trans_mats=trans_mats,
topk=100,
kernel=3)
result_list = []
for img_id in range(len(img_metas)):
bboxes = batch_bboxes[img_id]
scores = batch_scores[img_id]
labels = batch_topk_labels[img_id]
keep_idx = scores > 0.25
bboxes = bboxes[keep_idx]
scores = scores[keep_idx]
labels = labels[keep_idx]
bboxes = img_metas[img_id]['box_type_3d'](
bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5))
attrs = None
result_list.append((bboxes, scores, labels, attrs))
return result_list
def decode_heatmap(self,
cls_score,
reg_pred,
img_metas,
cam2imgs,
trans_mats,
topk=100,
kernel=3):
"""Transform outputs into detections raw bbox predictions.
Args:
cls_score (Tensor): Center predict heatmap,
shape (B, num_classes, H, W).
reg_pred (Tensor): Box regression map,
shape (B, channel, H, W).
img_metas (List[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cam2imgs (Tensor): Camera intrinsic matrices,
shape (B, 4, 4).
trans_mats (Tensor): Transformation matrices from original image
to feature map, shape (B, 3, 3).
topk (int): Get top k center keypoints from heatmap. Default 100.
kernel (int): Max pooling kernel for extract local maximum pixels.
Default 3.
Returns:
tuple[torch.Tensor]: Decoded output of SMOKEHead, containing
the following Tensors:
- batch_bboxes (Tensor): Coords of each 3D box.
shape (B, k, 7)
- batch_scores (Tensor): Scores of each 3D box.
shape (B, k)
- batch_topk_labels (Tensor): Categories of each 3D box.
shape (B, k)
"""
img_h, img_w = img_metas[0]['pad_shape'][:2]
bs, _, feat_h, feat_w = cls_score.shape
center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel)
*batch_dets, topk_ys, topk_xs = get_topk_from_heatmap(
center_heatmap_pred, k=topk)
batch_scores, batch_index, batch_topk_labels = batch_dets
regression = transpose_and_gather_feat(reg_pred, batch_index)
regression = regression.view(-1, 8)
points = torch.cat([topk_xs.view(-1, 1),
topk_ys.view(-1, 1).float()],
dim=1)
locations, dimensions, orientations = self.bbox_coder.decode(
regression, points, batch_topk_labels, cam2imgs, trans_mats)
batch_bboxes = torch.cat((locations, dimensions, orientations), dim=1)
batch_bboxes = batch_bboxes.view(bs, -1, self.bbox_code_size)
return batch_bboxes, batch_scores, batch_topk_labels
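For intuition, a plain-PyTorch sketch (shapes are assumptions; the mmdet helpers above do the same with extra bookkeeping) of the max-pool NMS and top-k peak extraction used in this decoding step:

import torch
import torch.nn.functional as F

B, num_classes, H, W = 2, 3, 96, 320
heatmap = torch.rand(B, num_classes, H, W)
local_max = F.max_pool2d(heatmap, kernel_size=3, stride=1, padding=1)
peaks = heatmap * (local_max == heatmap).float()           # keep only local maxima
scores, inds = peaks.view(B, -1).topk(100)                 # flatten class and spatial dims
topk_labels = inds // (H * W)
topk_ys = (inds % (H * W)) // W
topk_xs = (inds % (H * W)) % W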
def get_predictions(self, labels3d, centers2d, gt_locations, gt_dimensions,
gt_orientations, indices, img_metas, pred_reg):
"""Prepare predictions for computing loss.
Args:
labels3d (Tensor): Labels of each 3D box.
shape (B, max_objs, )
centers2d (Tensor): Coords of each projected 3D box
center on image. shape (B * max_objs, 2)
gt_locations (Tensor): Coords of each 3D box's location.
shape (B * max_objs, 3)
gt_dimensions (Tensor): Dimensions of each 3D box.
shape (N, 3)
gt_orientations (Tensor): Orientation(yaw) of each 3D box.
shape (N, 1)
indices (Tensor): Indices of the existence of the 3D box.
shape (B * max_objs, )
img_metas (list[dict]): Meta information of each image,
e.g., image size, scaling factor, etc.
pred_reg (Tensor): Box regression map,
shape (B, channel, H, W).
Returns:
dict: the dict has components below:
- bbox3d_yaws (:obj:`CameraInstance3DBoxes`):
bbox calculated using pred orientations.
- bbox3d_dims (:obj:`CameraInstance3DBoxes`):
bbox calculated using pred dimensions.
- bbox3d_locs (:obj:`CameraInstance3DBoxes`):
bbox calculated using pred locations.
"""
batch, channel = pred_reg.shape[0], pred_reg.shape[1]
w = pred_reg.shape[3]
cam2imgs = torch.stack([
gt_locations.new_tensor(img_meta['cam2img'])
for img_meta in img_metas
])
trans_mats = torch.stack([
gt_locations.new_tensor(img_meta['trans_mat'])
for img_meta in img_metas
])
centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0]
centers2d_inds = centers2d_inds.view(batch, -1)
pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds)
pred_regression_pois = pred_regression.view(-1, channel)
locations, dimensions, orientations = self.bbox_coder.decode(
pred_regression_pois, centers2d, labels3d, cam2imgs, trans_mats,
gt_locations)
locations, dimensions, orientations = locations[indices], dimensions[
indices], orientations[indices]
locations[:, 1] += dimensions[:, 1] / 2
gt_locations = gt_locations[indices]
assert len(locations) == len(gt_locations)
assert len(dimensions) == len(gt_dimensions)
assert len(orientations) == len(gt_orientations)
bbox3d_yaws = self.bbox_coder.encode(gt_locations, gt_dimensions,
orientations, img_metas)
bbox3d_dims = self.bbox_coder.encode(gt_locations, dimensions,
gt_orientations, img_metas)
bbox3d_locs = self.bbox_coder.encode(locations, gt_dimensions,
gt_orientations, img_metas)
pred_bboxes = dict(ori=bbox3d_yaws, dim=bbox3d_dims, loc=bbox3d_locs)
return pred_bboxes
def get_targets(self, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d,
centers2d, feat_shape, img_shape, img_metas):
"""Get training targets for batch images.
Args:
gt_bboxes (list[Tensor]): Ground truth bboxes of each image,
shape (num_gt, 4).
gt_labels (list[Tensor]): Ground truth labels of each box,
shape (num_gt,).
gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D Ground
truth bboxes of each image,
shape (num_gt, bbox_code_size).
gt_labels_3d (list[Tensor]): 3D Ground truth labels of each
box, shape (num_gt,).
centers2d (list[Tensor]): Projected 3D centers onto 2D image,
shape (num_gt, 2).
feat_shape (tuple[int]): Feature map shape with value,
shape (B, _, H, W).
img_shape (tuple[int]): Image shape in [h, w] format.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
tuple[Tensor, dict]: The Tensor value is the targets of
center heatmap, the dict has components below:
- gt_centers2d (Tensor): Coords of each projected 3D box
center on image. shape (B * max_objs, 2)
- gt_labels3d (Tensor): Labels of each 3D box.
shape (B, max_objs, )
- indices (Tensor): Indices of the existence of the 3D box.
shape (B * max_objs, )
- reg_indices (Tensor): Indices of 3D boxes from samples
without affine augmentation. shape (N, )
- gt_locs (Tensor): Coords of each 3D box's location.
shape (N, 3)
- gt_dims (Tensor): Dimensions of each 3D box.
shape (N, 3)
- gt_yaws (Tensor): Orientation(yaw) of each 3D box.
shape (N, 1)
- gt_cors (Tensor): Coords of the corners of each 3D box.
shape (N, 8, 3)
"""
reg_mask = torch.stack([
gt_bboxes[0].new_tensor(
not img_meta['affine_aug'], dtype=torch.bool)
for img_meta in img_metas
])
img_h, img_w = img_shape[:2]
bs, _, feat_h, feat_w = feat_shape
width_ratio = float(feat_w / img_w) # 1/4
height_ratio = float(feat_h / img_h) # 1/4
assert width_ratio == height_ratio
center_heatmap_target = gt_bboxes[-1].new_zeros(
[bs, self.num_classes, feat_h, feat_w])
gt_centers2d = centers2d.copy()
for batch_id in range(bs):
gt_bbox = gt_bboxes[batch_id]
gt_label = gt_labels[batch_id]
# project centers2d from input image to feat map
gt_center2d = gt_centers2d[batch_id] * width_ratio
for j, center in enumerate(gt_center2d):
center_x_int, center_y_int = center.int()
scale_box_h = (gt_bbox[j][3] - gt_bbox[j][1]) * height_ratio
scale_box_w = (gt_bbox[j][2] - gt_bbox[j][0]) * width_ratio
radius = gaussian_radius([scale_box_h, scale_box_w],
min_overlap=0.7)
radius = max(0, int(radius))
ind = gt_label[j]
gen_gaussian_target(center_heatmap_target[batch_id, ind],
[center_x_int, center_y_int], radius)
avg_factor = max(1, center_heatmap_target.eq(1).sum())
num_ctrs = [center2d.shape[0] for center2d in centers2d]
max_objs = max(num_ctrs)
reg_inds = torch.cat(
[reg_mask[i].repeat(num_ctrs[i]) for i in range(bs)])
inds = torch.zeros((bs, max_objs),
dtype=torch.bool).to(centers2d[0].device)
# put gt 3d bboxes to gpu
gt_bboxes_3d = [
gt_bbox_3d.to(centers2d[0].device) for gt_bbox_3d in gt_bboxes_3d
]
batch_centers2d = centers2d[0].new_zeros((bs, max_objs, 2))
batch_labels_3d = gt_labels_3d[0].new_zeros((bs, max_objs))
batch_gt_locations = \
gt_bboxes_3d[0].tensor.new_zeros((bs, max_objs, 3))
for i in range(bs):
inds[i, :num_ctrs[i]] = 1
batch_centers2d[i, :num_ctrs[i]] = centers2d[i]
batch_labels_3d[i, :num_ctrs[i]] = gt_labels_3d[i]
batch_gt_locations[i, :num_ctrs[i]] = \
gt_bboxes_3d[i].tensor[:, :3]
inds = inds.flatten()
batch_centers2d = batch_centers2d.view(-1, 2) * width_ratio
batch_gt_locations = batch_gt_locations.view(-1, 3)
# filter the empty image, without gt_bboxes_3d
gt_bboxes_3d = [
gt_bbox_3d for gt_bbox_3d in gt_bboxes_3d
if gt_bbox_3d.tensor.shape[0] > 0
]
gt_dimensions = torch.cat(
[gt_bbox_3d.tensor[:, 3:6] for gt_bbox_3d in gt_bboxes_3d])
gt_orientations = torch.cat([
gt_bbox_3d.tensor[:, 6].unsqueeze(-1)
for gt_bbox_3d in gt_bboxes_3d
])
gt_corners = torch.cat(
[gt_bbox_3d.corners for gt_bbox_3d in gt_bboxes_3d])
target_labels = dict(
gt_centers2d=batch_centers2d.long(),
gt_labels3d=batch_labels_3d,
indices=inds,
reg_indices=reg_inds,
gt_locs=batch_gt_locations,
gt_dims=gt_dimensions,
gt_yaws=gt_orientations,
gt_cors=gt_corners)
return center_heatmap_target, avg_factor, target_labels
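A small illustration (feature-map size, center and radius are made-up values) of how a single ground-truth center is splatted onto the classification heatmap target, mirroring the gen_gaussian_target call above:

import torch

feat_h, feat_w, radius = 96, 320, 2
cx, cy = 100, 40                                           # integer center on the feature map
sigma = (2 * radius + 1) / 6                               # diameter / 6, the usual CenterNet-style choice
ys, xs = torch.meshgrid(torch.arange(feat_h), torch.arange(feat_w), indexing='ij')
gaussian = torch.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2 * sigma ** 2))
heatmap = torch.zeros(feat_h, feat_w)
heatmap = torch.maximum(heatmap, gaussian)                 # element-wise max keeps overlapping peaks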
def loss(self,
cls_scores,
bbox_preds,
gt_bboxes,
gt_labels,
gt_bboxes_3d,
gt_labels_3d,
centers2d,
depths,
attr_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level.
shape (num_gt, 4).
bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel
number is bbox_code_size.
shape (B, 7, H, W).
gt_bboxes (list[Tensor]): Ground truth bboxes for each image.
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): Class indices corresponding to each box.
shape (num_gts, ).
gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D boxes ground
truth. It is the flipped gt_bboxes.
gt_labels_3d (list[Tensor]): Same as gt_labels.
centers2d (list[Tensor]): 2D centers on the image.
shape (num_gts, 2).
depths (list[Tensor]): Depth ground truth.
shape (num_gts, ).
attr_labels (list[Tensor]): Attribute indices of each box.
In KITTI it is None.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
boxes can be ignored when computing the loss.
Default: None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert len(cls_scores) == len(bbox_preds) == 1
assert attr_labels is None
assert gt_bboxes_ignore is None
center2d_heatmap = cls_scores[0]
pred_reg = bbox_preds[0]
center2d_heatmap_target, avg_factor, target_labels = \
self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d,
center2d_heatmap.shape,
img_metas[0]['pad_shape'],
img_metas)
pred_bboxes = self.get_predictions(
labels3d=target_labels['gt_labels3d'],
centers2d=target_labels['gt_centers2d'],
gt_locations=target_labels['gt_locs'],
gt_dimensions=target_labels['gt_dims'],
gt_orientations=target_labels['gt_yaws'],
indices=target_labels['indices'],
img_metas=img_metas,
pred_reg=pred_reg)
loss_cls = self.loss_cls(
center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor)
reg_inds = target_labels['reg_indices']
loss_bbox_oris = self.loss_bbox(
pred_bboxes['ori'].corners[reg_inds, ...],
target_labels['gt_cors'][reg_inds, ...])
loss_bbox_dims = self.loss_bbox(
pred_bboxes['dim'].corners[reg_inds, ...],
target_labels['gt_cors'][reg_inds, ...])
loss_bbox_locs = self.loss_bbox(
pred_bboxes['loc'].corners[reg_inds, ...],
target_labels['gt_cors'][reg_inds, ...])
loss_bbox = loss_bbox_dims + loss_bbox_locs + loss_bbox_oris
loss_dict = dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
return loss_dict
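To connect the pieces, a hedged configuration sketch (all values are illustrative assumptions in the spirit of a KITTI three-class setup, not taken from this diff; exact coder kwargs may differ) of how this head might be declared in a model config:

# assumed / illustrative values throughout
bbox_head = dict(
    type='SMOKEMono3DHead',
    num_classes=3,
    in_channels=64,
    feat_channels=64,
    dim_channel=[3, 4, 5],                  # assumed slots for the dimension offsets
    ori_channel=[6, 7],                     # assumed slots for the (sin, cos) orientation pair
    bbox_coder=dict(
        type='SMOKECoder',                  # coder registered in mmdet3d; kwargs here are assumptions
        base_depth=(28.01, 16.32),
        base_dims=((0.88, 1.73, 0.67), (1.78, 1.70, 0.58), (3.88, 1.63, 1.53)),
        code_size=7),
    loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0),
    loss_bbox=dict(type='L1Loss', loss_weight=0.1))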
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmcv.ops.nms import batched_nms
from mmcv.runner import force_fp32
...@@ -128,15 +127,15 @@ class SSD3DHead(VoteHead):
Args:
bbox_preds (dict): Predictions from forward of SSD3DHead.
points (list[torch.Tensor]): Input points.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
bboxes of each sample.
gt_labels_3d (list[torch.Tensor]): Labels of each sample.
pts_semantic_mask (list[torch.Tensor]): Point-wise
semantic mask.
pts_instance_mask (list[torch.Tensor]): Point-wise
instance mask.
img_metas (list[dict]): Contain pcd and img's meta info.
gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding
boxes can be ignored when computing the loss.
Returns:
...@@ -231,12 +230,12 @@ class SSD3DHead(VoteHead):
Args:
points (list[torch.Tensor]): Points of each batch.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): Labels of each batch.
pts_semantic_mask (list[torch.Tensor]): Point-wise semantic
label of each batch.
pts_instance_mask (list[torch.Tensor]): Point-wise instance
label of each batch.
bbox_preds (torch.Tensor): Bounding box predictions of ssd3d head.
...@@ -320,12 +319,12 @@ class SSD3DHead(VoteHead):
Args:
points (torch.Tensor): Points of each batch.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth
boxes of each batch.
gt_labels_3d (torch.Tensor): Labels of each batch.
pts_semantic_mask (torch.Tensor): Point-wise semantic
label of each batch.
pts_instance_mask (torch.Tensor): Point-wise instance
label of each batch.
aggregated_points (torch.Tensor): Aggregated points from
candidate points layer.
...@@ -392,7 +391,8 @@ class SSD3DHead(VoteHead):
# LiDARInstance3DBoxes and DepthInstance3DBoxes
canonical_xyz = rotation_3d_in_axis(
canonical_xyz.unsqueeze(0).transpose(0, 1),
-gt_bboxes_3d.yaw[assignment],
axis=2).squeeze(1)
distance_front = torch.clamp(
size_res_targets[:, 0] - canonical_xyz[:, 0], min=0)
distance_back = torch.clamp(
...@@ -441,7 +441,7 @@ class SSD3DHead(VoteHead):
negative_mask)
def get_bboxes(self, points, bbox_preds, input_metas, rescale=False):
"""Generate bboxes from 3DSSD head predictions.
Args:
points (torch.Tensor): Input points.
...@@ -464,9 +464,7 @@ class SSD3DHead(VoteHead):
bbox_selected, score_selected, labels = self.multiclass_nms_single(
obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3],
input_metas[b])
bbox = input_metas[b]['box_type_3d'](
bbox_selected.clone(),
box_dim=bbox_selected.shape[-1],
...@@ -481,7 +479,7 @@ class SSD3DHead(VoteHead):
Args:
obj_scores (torch.Tensor): Objectness score of bounding boxes.
sem_scores (torch.Tensor): Semantic class score of bounding boxes.
bbox (torch.Tensor): Predicted bounding boxes.
points (torch.Tensor): Input points.
input_meta (dict): Point cloud and image's meta info.
...@@ -489,23 +487,14 @@ class SSD3DHead(VoteHead):
Returns:
tuple[torch.Tensor]: Bounding boxes, scores and labels.
"""
bbox = input_meta['box_type_3d'](
bbox.clone(),
box_dim=bbox.shape[-1],
with_yaw=self.bbox_coder.with_rot,
origin=(0.5, 0.5, 0.5))
if isinstance(bbox, (LiDARInstance3DBoxes, DepthInstance3DBoxes)):
box_indices = bbox.points_in_boxes_all(points)
nonempty_box_mask = box_indices.T.sum(1) >= 0
else:
raise NotImplementedError('Unsupported bbox type!')
...@@ -516,20 +505,20 @@ class SSD3DHead(VoteHead):
minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0]
bbox_classes = torch.argmax(sem_scores, -1)
nms_keep = batched_nms(
minmax_box3d[nonempty_box_mask][:, [0, 1, 3, 4]],
obj_scores[nonempty_box_mask], bbox_classes[nonempty_box_mask],
self.test_cfg.nms_cfg)[1]
if nms_keep.shape[0] > self.test_cfg.max_output_num:
nms_keep = nms_keep[:self.test_cfg.max_output_num]
# filter empty boxes and boxes with low score
scores_mask = (obj_scores >= self.test_cfg.score_thr)
nonempty_box_inds = torch.nonzero(
nonempty_box_mask, as_tuple=False).flatten()
nonempty_mask = torch.zeros_like(bbox_classes).scatter(
0, nonempty_box_inds[nms_keep], 1)
selected = (nonempty_mask.bool() & scores_mask.bool())
if self.test_cfg.per_class_proposal:
...@@ -560,18 +549,8 @@ class SSD3DHead(VoteHead):
tuple[torch.Tensor]: Flags indicating whether each point is
inside bbox and the index of the box each point is in.
"""
if isinstance(bboxes_3d, (LiDARInstance3DBoxes, DepthInstance3DBoxes)):
points_mask = bboxes_3d.points_in_boxes_all(points)
assignment = points_mask.argmax(dim=-1)
else:
raise NotImplementedError('Unsupported bbox type!')
......
...@@ -25,7 +25,7 @@ class AnchorTrainMixin(object):
gt_bboxes_list (list[:obj:`BaseInstance3DBoxes`]): Ground truth
bboxes of each image.
input_metas (list[dict]): Meta info of each image.
gt_bboxes_ignore_list (list): Ignore list of gt bboxes.
gt_labels_list (list[torch.Tensor]): Gt labels of batches.
label_channels (int): The channel of labels.
num_classes (int): The number of classes.
...@@ -35,7 +35,7 @@ class AnchorTrainMixin(object):
tuple (list, list, list, list, list, list, int, int):
Anchor targets, including labels, label weights,
bbox targets, bbox weights, direction targets,
direction weights, number of positive anchors and
number of negative anchors.
"""
num_imgs = len(input_metas)
...@@ -293,6 +293,7 @@ class AnchorTrainMixin(object):
sampling_result.pos_bboxes,
pos_bbox_targets,
self.dir_offset,
self.dir_limit_offset,
one_hot=False)
bbox_targets[pos_inds, :] = pos_bbox_targets
bbox_weights[pos_inds, :] = 1.0
...@@ -318,6 +319,7 @@ class AnchorTrainMixin(object):
def get_direction_target(anchors,
reg_targets,
dir_offset=0,
dir_limit_offset=0,
num_bins=2,
one_hot=True):
"""Encode direction to 0 ~ num_bins-1.
...@@ -333,7 +335,7 @@ def get_direction_target(anchors,
torch.Tensor: Encoded direction targets.
"""
rot_gt = reg_targets[..., 6] + anchors[..., 6]
offset_rot = limit_period(rot_gt - dir_offset, dir_limit_offset, 2 * np.pi)
dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long()
dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1)
if one_hot:
......
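For clarity, a standalone sketch (plain tensors; the offset values are examples, and the modulo below matches limit_period only for the default dir_limit_offset=0) of the yaw binning performed in get_direction_target above:

import numpy as np
import torch

rot_gt = torch.tensor([0.1, 3.3, -2.9])                     # example yaw targets (rad)
dir_offset, num_bins = np.pi / 4, 2
offset_rot = (rot_gt - dir_offset) % (2 * np.pi)            # wrap into [0, 2*pi)
dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long()
dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1)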
...@@ -136,7 +136,7 @@ class VoteHead(BaseModule):
"""Forward pass.
Note:
The forward of VoteHead is divided into 4 steps:
1. Generate vote_points from seed_points.
2. Aggregate vote_points.
...@@ -234,15 +234,15 @@ class VoteHead(BaseModule):
Args:
bbox_preds (dict): Predictions from forward of vote head.
points (list[torch.Tensor]): Input points.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
bboxes of each sample.
gt_labels_3d (list[torch.Tensor]): Labels of each sample.
pts_semantic_mask (list[torch.Tensor]): Point-wise
semantic mask.
pts_instance_mask (list[torch.Tensor]): Point-wise
instance mask.
img_metas (list[dict]): Contain pcd and img's meta info.
gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding
boxes can be ignored when computing the loss.
ret_target (bool): Return targets or not.
...@@ -358,12 +358,12 @@ class VoteHead(BaseModule):
Args:
points (list[torch.Tensor]): Points of each batch.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): Labels of each batch.
pts_semantic_mask (list[torch.Tensor]): Point-wise semantic
label of each batch.
pts_instance_mask (list[torch.Tensor]): Point-wise instance
label of each batch.
bbox_preds (torch.Tensor): Bounding box predictions of vote head.
...@@ -447,12 +447,12 @@ class VoteHead(BaseModule):
Args:
points (torch.Tensor): Points of each batch.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth
boxes of each batch.
gt_labels_3d (torch.Tensor): Labels of each batch.
pts_semantic_mask (torch.Tensor): Point-wise semantic
label of each batch.
pts_instance_mask (torch.Tensor): Point-wise instance
label of each batch.
aggregated_points (torch.Tensor): Aggregated points from
vote aggregation layer.
...@@ -471,7 +471,7 @@ class VoteHead(BaseModule):
vote_target_masks = points.new_zeros([num_points],
dtype=torch.long)
vote_target_idx = points.new_zeros([num_points], dtype=torch.long)
box_indices_all = gt_bboxes_3d.points_in_boxes_all(points)
for i in range(gt_labels_3d.shape[0]):
box_indices = box_indices_all[:, i]
indices = torch.nonzero(
...@@ -621,7 +621,7 @@ class VoteHead(BaseModule):
box_dim=bbox.shape[-1],
with_yaw=self.bbox_coder.with_rot,
origin=(0.5, 0.5, 0.5))
box_indices = bbox.points_in_boxes_all(points)
corner3d = bbox.corners
minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6)))
......
...@@ -10,7 +10,9 @@ from .imvoxelnet import ImVoxelNet
from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN
from .mvx_two_stage import MVXTwoStageDetector
from .parta2 import PartA2
from .point_rcnn import PointRCNN
from .single_stage_mono3d import SingleStageMono3DDetector
from .smoke_mono3d import SMOKEMono3D
from .ssd3dnet import SSD3DNet
from .votenet import VoteNet
from .voxelnet import VoxelNet
...@@ -19,5 +21,5 @@ __all__ = [
'Base3DDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXTwoStageDetector',
'DynamicMVXFasterRCNN', 'MVXFasterRCNN', 'PartA2', 'VoteNet', 'H3DNet',
'CenterPoint', 'SSD3DNet', 'ImVoteNet', 'SingleStageMono3DDetector',
'FCOSMono3D', 'ImVoxelNet', 'GroupFree3DNet', 'PointRCNN', 'SMOKEMono3D'
]
# Copyright (c) OpenMMLab. All rights reserved.
from os import path as osp
import mmcv
import torch
from mmcv.parallel import DataContainer as DC
from mmcv.runner import auto_fp16
from mmdet3d.core import Box3DMode, Coord3DMode, show_result
from mmdet.models.detectors import BaseDetector
...@@ -114,7 +115,7 @@ class Base3DDetector(BaseDetector):
Box3DMode.DEPTH)
elif box_mode_3d != Box3DMode.DEPTH:
ValueError(
f'Unsupported box_mode_3d {box_mode_3d} for conversion!')
pred_bboxes = pred_bboxes.tensor.cpu().numpy()
show_result(
points,
......
...@@ -97,7 +97,8 @@ class CenterPoint(MVXTwoStageDetector):
Args:
feats (list[torch.Tensor]): Feature of point cloud.
img_metas (list[dict]): Meta information of samples.
rescale (bool, optional): Whether to rescale bboxes.
Default: False.
Returns:
dict: Returned bboxes consists of the following keys:
...@@ -121,8 +122,8 @@ class CenterPoint(MVXTwoStageDetector):
task_id][0][key][:, 1, ...]
elif key == 'rot':
outs[task_id][0][
key][:, 0,
...] = -outs[task_id][0][key][:, 0, ...]
elif key == 'vel':
outs[task_id][0][
key][:, 1,
...@@ -135,8 +136,8 @@ class CenterPoint(MVXTwoStageDetector):
task_id][0][key][:, 0, ...]
elif key == 'rot':
outs[task_id][0][
key][:, 1,
...] = -outs[task_id][0][key][:, 1, ...]
elif key == 'vel':
outs[task_id][0][
key][:, 0,
......
...@@ -38,11 +38,11 @@ class GroupFree3DNet(SingleStage3DDetector):
img_metas (list): Image metas.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): gt class labels of each batch.
pts_semantic_mask (list[torch.Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (list[torch.Tensor]): point-wise instance
label of each batch.
gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding
boxes can be ignored when computing the loss.
Returns:
......
...@@ -47,11 +47,11 @@ class H3DNet(TwoStage3DDetector):
img_metas (list): Image metas.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): gt class labels of each batch.
pts_semantic_mask (list[torch.Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (list[torch.Tensor]): point-wise instance
label of each batch.
gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding
boxes can be ignored when computing the loss.
Returns:
......
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
import numpy as np
import torch
from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d
from mmdet3d.models.utils import MLP
...@@ -149,21 +150,21 @@ class ImVoteNet(Base3DDetector):
if self.with_img_backbone:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.img_backbone.init_cfg = dict(
type='Pretrained', checkpoint=img_pretrained)
if self.with_img_roi_head:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.img_roi_head.init_cfg = dict(
type='Pretrained', checkpoint=img_pretrained)
if self.with_pts_backbone:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.pts_backbone.init_cfg = dict(
type='Pretrained', checkpoint=pts_pretrained)
...@@ -393,9 +394,9 @@ class ImVoteNet(Base3DDetector):
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[torch.Tensor]): class indices for each
2d bounding box.
gt_bboxes_ignore (list[torch.Tensor]): specify which
2d bounding boxes can be ignored when computing the loss.
gt_masks (torch.Tensor): true segmentation masks for each
2d bbox, used if the architecture supports a segmentation task.
proposals: override rpn proposals (2d) with custom proposals.
Use when `with_rpn` is False.
...@@ -403,9 +404,9 @@ class ImVoteNet(Base3DDetector):
not supported yet.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): 3d gt bboxes.
gt_labels_3d (list[torch.Tensor]): gt class labels for 3d bboxes.
pts_semantic_mask (list[torch.Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (list[torch.Tensor]): point-wise instance
label of each batch.
Returns:
......
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from os import path as osp
import mmcv
import torch
from mmcv.parallel import DataContainer as DC
from mmcv.runner import force_fp32
from torch.nn import functional as F
from mmdet3d.core import (Box3DMode, Coord3DMode, bbox3d2result,
...@@ -84,21 +85,20 @@ class MVXTwoStageDetector(Base3DDetector):
if self.with_img_backbone:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.img_backbone.init_cfg = dict(
type='Pretrained', checkpoint=img_pretrained)
if self.with_img_roi_head:
if img_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg.')
self.img_roi_head.init_cfg = dict(
type='Pretrained', checkpoint=img_pretrained)
if self.with_pts_backbone:
if pts_pretrained is not None:
warnings.warn('DeprecationWarning: pretrained is a deprecated '
'key, please consider using init_cfg')
self.pts_backbone.init_cfg = dict(
type='Pretrained', checkpoint=pts_pretrained)
...@@ -260,7 +260,7 @@ class MVXTwoStageDetector(Base3DDetector):
of 2D boxes in images. Defaults to None.
gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
images. Defaults to None.
img (torch.Tensor, optional): Images of each sample with shape
(N, C, H, W). Defaults to None.
proposals (list[torch.Tensor], optional): Predicted proposals
used for training Fast RCNN. Defaults to None.
...@@ -497,7 +497,7 @@ class MVXTwoStageDetector(Base3DDetector):
Box3DMode.DEPTH)
elif box_mode_3d != Box3DMode.DEPTH:
ValueError(
f'Unsupported box_mode_3d {box_mode_3d} for conversion!')
pred_bboxes = pred_bboxes.tensor.cpu().numpy()
show_result(points, None, pred_bboxes, out_dir, file_name)
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet.models import DETECTORS
from .two_stage import TwoStage3DDetector
@DETECTORS.register_module()
class PointRCNN(TwoStage3DDetector):
r"""PointRCNN detector.
Please refer to the `PointRCNN <https://arxiv.org/abs/1812.04244>`_ paper for more details.
Args:
backbone (dict): Config dict of detector's backbone.
neck (dict, optional): Config dict of neck. Defaults to None.
rpn_head (dict, optional): Config of RPN head. Defaults to None.
roi_head (dict, optional): Config of ROI head. Defaults to None.
train_cfg (dict, optional): Train configs. Defaults to None.
test_cfg (dict, optional): Test configs. Defaults to None.
pretrained (str, optional): Model pretrained path. Defaults to None.
init_cfg (dict, optional): Config of initialization. Defaults to None.
"""
def __init__(self,
backbone,
neck=None,
rpn_head=None,
roi_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
init_cfg=None):
super(PointRCNN, self).__init__(
backbone=backbone,
neck=neck,
rpn_head=rpn_head,
roi_head=roi_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained,
init_cfg=init_cfg)
def extract_feat(self, points):
"""Directly extract features from the backbone+neck.
Args:
points (torch.Tensor): Input points.
Returns:
dict: Features from the backbone+neck.
"""
x = self.backbone(points)
if self.with_neck:
x = self.neck(x)
return x
def forward_train(self, points, img_metas, gt_bboxes_3d, gt_labels_3d):
"""Forward of training.
Args:
points (list[torch.Tensor]): Points of each batch.
img_metas (list[dict]): Meta information of each sample.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): gt class labels of each batch.
Returns:
dict: Losses.
"""
losses = dict()
points_cat = torch.stack(points)
x = self.extract_feat(points_cat)
# features for rcnn
backbone_feats = x['fp_features'].clone()
backbone_xyz = x['fp_xyz'].clone()
rcnn_feats = {'features': backbone_feats, 'points': backbone_xyz}
bbox_preds, cls_preds = self.rpn_head(x)
rpn_loss = self.rpn_head.loss(
bbox_preds=bbox_preds,
cls_preds=cls_preds,
points=points,
gt_bboxes_3d=gt_bboxes_3d,
gt_labels_3d=gt_labels_3d,
img_metas=img_metas)
losses.update(rpn_loss)
bbox_list = self.rpn_head.get_bboxes(points_cat, bbox_preds, cls_preds,
img_metas)
proposal_list = [
dict(
boxes_3d=bboxes,
scores_3d=scores,
labels_3d=labels,
cls_preds=preds_cls)
for bboxes, scores, labels, preds_cls in bbox_list
]
rcnn_feats.update({'points_cls_preds': cls_preds})
roi_losses = self.roi_head.forward_train(rcnn_feats, img_metas,
proposal_list, gt_bboxes_3d,
gt_labels_3d)
losses.update(roi_losses)
return losses
def simple_test(self, points, img_metas, imgs=None, rescale=False):
"""Forward of testing.
Args:
points (list[torch.Tensor]): Points of each sample.
img_metas (list[dict]): Image metas.
imgs (list[torch.Tensor], optional): Images of each sample.
Defaults to None.
rescale (bool, optional): Whether to rescale results.
Defaults to False.
Returns:
list: Predicted 3d boxes.
"""
points_cat = torch.stack(points)
x = self.extract_feat(points_cat)
# features for rcnn
backbone_feats = x['fp_features'].clone()
backbone_xyz = x['fp_xyz'].clone()
rcnn_feats = {'features': backbone_feats, 'points': backbone_xyz}
bbox_preds, cls_preds = self.rpn_head(x)
rcnn_feats.update({'points_cls_preds': cls_preds})
bbox_list = self.rpn_head.get_bboxes(
points_cat, bbox_preds, cls_preds, img_metas, rescale=rescale)
proposal_list = [
dict(
boxes_3d=bboxes,
scores_3d=scores,
labels_3d=labels,
cls_preds=preds_cls)
for bboxes, scores, labels, preds_cls in bbox_list
]
bbox_results = self.roi_head.simple_test(rcnn_feats, img_metas,
proposal_list)
return bbox_results
# Copyright (c) OpenMMLab. All rights reserved.
from os import path as osp
import mmcv
import numpy as np
import torch
from mmcv.parallel import DataContainer as DC
from mmdet3d.core import (CameraInstance3DBoxes, bbox3d2result,
show_multi_modality_result)
...@@ -48,14 +49,15 @@ class SingleStageMono3DDetector(SingleStageDetector):
image in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): Class indices corresponding to each box
gt_bboxes_3d (list[Tensor]): Each item contains the 3D ground truth
boxes for each image in [x, y, z, x_size, y_size, z_size, yaw,
vx, vy] format.
gt_labels_3d (list[Tensor]): 3D class indices corresponding to
each box.
centers2d (list[Tensor]): Projected 3D centers onto 2D images.
depths (list[Tensor]): Depth of projected centers on 2D images.
attr_labels (list[Tensor], optional): Attribute indices
corresponding to each box
gt_bboxes_ignore (list[Tensor]): Specify which bounding
boxes can be ignored when computing the loss.
Returns:
......
# Copyright (c) OpenMMLab. All rights reserved.
from mmdet.models.builder import DETECTORS
from .single_stage_mono3d import SingleStageMono3DDetector
@DETECTORS.register_module()
class SMOKEMono3D(SingleStageMono3DDetector):
r"""SMOKE <https://arxiv.org/abs/2002.10111>`_ for monocular 3D object
detection.
"""
def __init__(self,
backbone,
neck,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(SMOKEMono3D, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)
...@@ -40,11 +40,11 @@ class VoteNet(SingleStage3DDetector):
img_metas (list): Image metas.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): gt class labels of each batch.
pts_semantic_mask (list[torch.Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (list[torch.Tensor]): point-wise instance
label of each batch.
gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding
boxes can be ignored when computing the loss.
Returns:
......
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial
import torch
from mmdet3d.core.points import get_points_type
......
...@@ -32,9 +32,9 @@ def point_sample(img_meta,
points (torch.Tensor): Nx3 point cloud in LiDAR coordinates.
proj_mat (torch.Tensor): 4x4 transformation matrix.
coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
img_scale_factor (torch.Tensor): Scale factor with shape of
(w_scale, h_scale).
img_crop_offset (torch.Tensor): Crop offset used to crop
image during data augmentation with shape of (w_offset, h_offset).
img_flip (bool): Whether the image is flipped.
img_pad_shape (tuple[int]): int tuple indicates the h & w after
......