"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "e013bab5674e8d35d1998a050e1fa239ac9a747d"
Unverified commit 8538177b authored by ChaimZhu, committed by GitHub

[Feature] Add MonoFlex Head (#1044)

parent 4590418e
@@ -81,16 +81,16 @@ class MonoFlexCoder(BaseBBoxCoder):
            torch.Tensor: Targets of orientations.
        """
        local_yaw = gt_bboxes_3d.local_yaw

        # encode local yaw (-pi ~ pi) to multibin format
-        encode_local_yaw = np.zeros(self.num_dir_bins * 2)
+        encode_local_yaw = local_yaw.new_zeros(
+            [local_yaw.shape[0], self.num_dir_bins * 2])
        bin_size = 2 * np.pi / self.num_dir_bins
        margin_size = bin_size * self.bin_margin

-        bin_centers = self.bin_centers
+        bin_centers = local_yaw.new_tensor(self.bin_centers)
        range_size = bin_size / 2 + margin_size

-        offsets = local_yaw - bin_centers.unsqueeze(0)
+        offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0)
        offsets[offsets > np.pi] = offsets[offsets > np.pi] - 2 * np.pi
        offsets[offsets < -np.pi] = offsets[offsets < -np.pi] + 2 * np.pi

@@ -98,7 +98,7 @@ class MonoFlexCoder(BaseBBoxCoder):
            offset = offsets[:, i]
            inds = abs(offset) < range_size
            encode_local_yaw[inds, i] = 1
-            encode_local_yaw[inds, i + self.num_dir_bins] = offset
+            encode_local_yaw[inds, i + self.num_dir_bins] = offset[inds]

        orientation_target = encode_local_yaw
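# (Illustrative sketch, not part of this commit's diff: the multibin encoding
#  produced by the hunk above, run standalone. A local yaw activates every bin
#  whose centre lies within bin_size / 2 + margin_size, so neighbouring bins
#  can both be hot. num_dir_bins=4 and bin_margin=np.pi / 6 follow the test
#  config added later in this commit.)
import numpy as np
import torch

num_dir_bins, bin_margin = 4, np.pi / 6
bin_centers = torch.tensor([0, np.pi / 2, np.pi, -np.pi / 2])
local_yaw = torch.tensor([0.3, -2.9])  # (N, ) local yaws in (-pi, pi]

bin_size = 2 * np.pi / num_dir_bins
range_size = bin_size / 2 + bin_size * bin_margin
offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0)
offsets[offsets > np.pi] -= 2 * np.pi
offsets[offsets < -np.pi] += 2 * np.pi

encoding = local_yaw.new_zeros([local_yaw.shape[0], num_dir_bins * 2])
for i in range(num_dir_bins):
    inds = offsets[:, i].abs() < range_size
    encoding[inds, i] = 1  # first half: bin confidence targets
    encoding[inds, i + num_dir_bins] = offsets[inds, i]  # second half: offsets
print(encoding)  # shape (N, 8)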
@@ -164,7 +164,7 @@ class MonoFlexCoder(BaseBBoxCoder):
        pred_direct_depth_uncertainty = bbox[:, 49:50].squeeze(-1)

        # 2 dimension of offsets x keypoints (8 corners + top/bottom center)
-        pred_keypoints2d = bbox[:, 6:26]
+        pred_keypoints2d = bbox[:, 6:26].reshape(-1, 10, 2)

        # 1 dimension for depth offsets
        pred_direct_depth_offsets = bbox[:, 48:49].squeeze(-1)
@@ -273,11 +273,11 @@ class MonoFlexCoder(BaseBBoxCoder):
            raise NotImplementedError

        # (N, 3)
        centers2d_img = \
-            torch.cat(centers2d_img, depths.unsqueeze(-1), dim=1)
+            torch.cat((centers2d_img, depths.unsqueeze(-1)), dim=1)
        # (N, 4, 1)
        centers2d_extend = \
            torch.cat((centers2d_img, centers2d_img.new_ones(N, 1)),
-                      dim=1).unqueeze(-1)
+                      dim=1).unsqueeze(-1)

        locations = torch.matmul(cam2imgs_inv, centers2d_extend).squeeze(-1)

        return locations[:, :3]
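# (Illustrative sketch, not part of this commit's diff: the pinhole
#  back-projection that decode_location performs. The projected centre is
#  first scaled by its depth (a step just above the visible hunk), a
#  homogeneous 1 is appended, and the inverse 4x4 intrinsic maps it back to
#  camera coordinates. The intrinsic values below are made up.)
import torch

cam2img = torch.eye(4)
cam2img[0, 0] = cam2img[1, 1] = 721.5  # assumed focal length
cam2img[0, 2], cam2img[1, 2] = 609.5, 172.8  # assumed principal point

centers2d = torch.tensor([[650.0, 180.0]])  # (N, 2) projected 3D centres
depths = torch.tensor([12.0])  # (N, )

centers2d_img = centers2d * depths.unsqueeze(-1)  # scale by depth
centers2d_img = torch.cat((centers2d_img, depths.unsqueeze(-1)), dim=1)
centers2d_extend = torch.cat(
    (centers2d_img, centers2d_img.new_ones(centers2d_img.shape[0], 1)),
    dim=1).unsqueeze(-1)  # (N, 4, 1)
cam2imgs_inv = torch.inverse(cam2img).unsqueeze(0)  # broadcast over N
locations = torch.matmul(cam2imgs_inv, centers2d_extend).squeeze(-1)
print(locations[:, :3])  # (N, 3) xyz in camera frame, z equals the depth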
@@ -450,15 +450,15 @@ class MonoFlexCoder(BaseBBoxCoder):
        local_yaws = orientations
        yaws = local_yaws + rays

-        larger_idx = (yaws > np.pi).nonzero()
-        small_idx = (yaws < -np.pi).nonzero()
+        larger_idx = (yaws > np.pi).nonzero(as_tuple=False)
+        small_idx = (yaws < -np.pi).nonzero(as_tuple=False)
        if len(larger_idx) != 0:
            yaws[larger_idx] -= 2 * np.pi
        if len(small_idx) != 0:
            yaws[small_idx] += 2 * np.pi

-        larger_idx = (local_yaws > np.pi).nonzero()
-        small_idx = (local_yaws < -np.pi).nonzero()
+        larger_idx = (local_yaws > np.pi).nonzero(as_tuple=False)
+        small_idx = (local_yaws < -np.pi).nonzero(as_tuple=False)
        if len(larger_idx) != 0:
            local_yaws[larger_idx] -= 2 * np.pi
        if len(small_idx) != 0:
@@ -491,7 +491,7 @@ class MonoFlexCoder(BaseBBoxCoder):

        return bboxes2d

-    def combine_depths(depth, depth_uncertainty):
+    def combine_depths(self, depth, depth_uncertainty):
        """Combine all the prediced depths with depth uncertainty.

        Args:
        ...
@@ -324,8 +324,11 @@ def yaw2local(yaw, loc):
        torch.Tensor: local yaw (alpha in kitti).
    """
    local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2])
-    while local_yaw > np.pi:
-        local_yaw -= np.pi * 2
-    while local_yaw < -np.pi:
-        local_yaw += np.pi * 2
+    larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False)
+    small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False)
+    if len(larger_idx) != 0:
+        local_yaw[larger_idx] -= 2 * np.pi
+    if len(small_idx) != 0:
+        local_yaw[small_idx] += 2 * np.pi

    return local_yaw
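# (Illustrative sketch, not part of this commit's diff: the yaw -> local yaw
#  (alpha) relation that yaw2local implements, alpha = yaw - atan2(x, z),
#  followed by the same vectorised wrap into (-pi, pi] that replaces the old
#  while loops. Numbers are made up.)
import numpy as np
import torch

yaw = torch.tensor([3.0, -3.0])
loc = torch.tensor([[-5.0, 1.0, 8.0], [5.0, 1.0, 8.0]])  # (x, y, z)

local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2])
larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False)
small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False)
if len(larger_idx) != 0:
    local_yaw[larger_idx] -= 2 * np.pi
if len(small_idx) != 0:
    local_yaw[small_idx] += 2 * np.pi
print(local_yaw)  # both values wrapped back into (-pi, pi]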
# Copyright (c) OpenMMLab. All rights reserved.
from .array_converter import ArrayConverter, array_converter
-from .gaussian import draw_heatmap_gaussian, gaussian_2d, gaussian_radius
+from .gaussian import (draw_heatmap_gaussian, ellip_gaussian2D, gaussian_2d,
+                       gaussian_radius, get_ellip_gaussian_2D)

__all__ = [
    'gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian',
-    'ArrayConverter', 'array_converter'
+    'ArrayConverter', 'array_converter', 'ellip_gaussian2D',
+    'get_ellip_gaussian_2D'
]

@@ -84,3 +84,75 @@ def gaussian_radius(det_size, min_overlap=0.5):
    sq3 = torch.sqrt(b3**2 - 4 * a3 * c3)
    r3 = (b3 + sq3) / 2
    return min(r1, r2, r3)
def get_ellip_gaussian_2D(heatmap, center, radius_x, radius_y, k=1):
"""Generate 2D ellipse gaussian heatmap.
Args:
heatmap (Tensor): Input heatmap, the gaussian kernel will cover on
it and maintain the max value.
center (list[int]): Coord of gaussian kernel's center.
radius_x (int): X-axis radius of gaussian kernel.
radius_y (int): Y-axis radius of gaussian kernel.
k (int, optional): Coefficient of gaussian kernel. Default: 1.
Returns:
out_heatmap (Tensor): Updated heatmap covered by gaussian kernel.
"""
diameter_x, diameter_y = 2 * radius_x + 1, 2 * radius_y + 1
gaussian_kernel = ellip_gaussian2D((radius_x, radius_y),
sigma_x=diameter_x / 6,
sigma_y=diameter_y / 6,
dtype=heatmap.dtype,
device=heatmap.device)
x, y = int(center[0]), int(center[1])
height, width = heatmap.shape[0:2]
left, right = min(x, radius_x), min(width - x, radius_x + 1)
top, bottom = min(y, radius_y), min(height - y, radius_y + 1)
masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
masked_gaussian = gaussian_kernel[radius_y - top:radius_y + bottom,
radius_x - left:radius_x + right]
out_heatmap = heatmap
torch.max(
masked_heatmap,
masked_gaussian * k,
out=out_heatmap[y - top:y + bottom, x - left:x + right])
return out_heatmap
def ellip_gaussian2D(radius,
sigma_x,
sigma_y,
dtype=torch.float32,
device='cpu'):
"""Generate 2D ellipse gaussian kernel.
Args:
radius (tuple(int)): Ellipse radius (radius_x, radius_y) of gaussian
kernel.
sigma_x (int): X-axis sigma of gaussian function.
sigma_y (int): Y-axis sigma of gaussian function.
dtype (torch.dtype, optional): Dtype of gaussian tensor.
Default: torch.float32.
device (str, optional): Device of gaussian tensor.
Default: 'cpu'.
Returns:
h (Tensor): Gaussian kernel with a
``(2 * radius_y + 1) * (2 * radius_x + 1)`` shape.
"""
x = torch.arange(
-radius[0], radius[0] + 1, dtype=dtype, device=device).view(1, -1)
y = torch.arange(
-radius[1], radius[1] + 1, dtype=dtype, device=device).view(-1, 1)
h = (-(x * x) / (2 * sigma_x * sigma_x) - (y * y) /
(2 * sigma_y * sigma_y)).exp()
h[h < torch.finfo(h.dtype).eps * h.max()] = 0
return h
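# (Illustrative sketch, not part of this commit's diff: drawing an elliptical
#  gaussian for a truncated object whose centre sits near the image border,
#  so the x-radius is much smaller than the y-radius. Values are arbitrary;
#  both helpers are exported via mmdet3d.core.utils in this commit.)
import torch
from mmdet3d.core.utils import get_ellip_gaussian_2D

heatmap = torch.zeros((48, 160))  # (H, W) heatmap for a single class
center = [3, 20]  # (x, y), close to the left edge
get_ellip_gaussian_2D(heatmap, center, radius_x=2, radius_y=8)
print(heatmap.max(), heatmap[20, 3])  # peak of 1.0 at the centre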
@@ -7,6 +7,7 @@ from .centerpoint_head import CenterHead
from .fcos_mono3d_head import FCOSMono3DHead
from .free_anchor3d_head import FreeAnchor3DHead
from .groupfree3d_head import GroupFree3DHead
+from .monoflex_head import MonoFlexHead
from .parta2_rpn_head import PartA2RPNHead
from .pgd_head import PGDHead
from .point_rpn_head import PointRPNHead
@@ -19,5 +20,6 @@ __all__ = [
    'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead',
    'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
    'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead',
-    'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead'
+    'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead',
+    'MonoFlexHead'
]
import torch
from mmcv.cnn import xavier_init
from torch import nn as nn
from mmdet3d.core.utils import get_ellip_gaussian_2D
from mmdet3d.models.model_utils import EdgeFusionModule
from mmdet3d.models.utils import (filter_outside_objs, get_edge_indices,
get_keypoints, handle_proj_objs)
from mmdet.core import multi_apply
from mmdet.core.bbox.builder import build_bbox_coder
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.utils import gaussian_radius, gen_gaussian_target
from mmdet.models.utils.gaussian_target import (get_local_maximum,
get_topk_from_heatmap,
transpose_and_gather_feat)
from .anchor_free_mono3d_head import AnchorFreeMono3DHead
@HEADS.register_module()
class MonoFlexHead(AnchorFreeMono3DHead):
r"""MonoFlex head used in `MonoFlex <https://arxiv.org/abs/2104.02323>`_
.. code-block:: none
/ --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> cls
|
| --> 3 x 3 conv --> 1 x 1 conv --> 2d bbox
|
| --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> 2d offsets
|
| --> 3 x 3 conv --> 1 x 1 conv --> keypoints offsets
|
| --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
feature
| --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
|
| --> 3 x 3 conv --> 1 x 1 conv --> 3d dimensions
|
| |--- 1 x 1 conv --> ori cls
| --> 3 x 3 conv --|
| |--- 1 x 1 conv --> ori offsets
|
| --> 3 x 3 conv --> 1 x 1 conv --> depth
|
\ --> 3 x 3 conv --> 1 x 1 conv --> depth uncertainty
Args:
use_edge_fusion (bool): Whether to use edge fusion module while
feature extraction.
edge_fusion_inds (list[tuple]): Indices of feature to use edge fusion.
edge_heatmap_ratio (float): Ratio of generating target heatmap.
filter_outside_objs (bool, optional): Whether to filter the
outside objects. Default: True.
        loss_cls (dict, optional): Config of classification loss.
            Default: loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0).
        loss_bbox (dict, optional): Config of localization loss.
            Default: loss_bbox=dict(type='IoULoss', loss_weight=0.1).
        loss_dir (dict, optional): Config of direction classification loss.
            Default: dict(type='MultiBinLoss', loss_weight=0.1).
        loss_keypoints (dict, optional): Config of keypoints loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_dims (dict, optional): Config of dimensions loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_offsets2d (dict, optional): Config of offsets2d loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_direct_depth (dict, optional): Config of directly regressed depth loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_keypoints_depth (dict, optional): Config of keypoints decoded depth loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_combined_depth (dict, optional): Config of combined depth loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
loss_attr (dict, optional): Config of attribute classification loss.
In MonoFlex, Default: None.
bbox_coder (dict, optional): Bbox coder for encoding and decoding boxes.
Default: dict(type='MonoFlexCoder', code_size=7).
norm_cfg (dict, optional): Dictionary to construct and config norm layer.
Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
init_cfg (dict): Initialization config dict. Default: None.
""" # noqa: E501
def __init__(self,
num_classes,
in_channels,
use_edge_fusion,
edge_fusion_inds,
edge_heatmap_ratio,
filter_outside_objs=True,
loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0),
loss_bbox=dict(type='IoULoss', loss_weight=0.1),
loss_dir=dict(type='MultiBinLoss', loss_weight=0.1),
loss_keypoints=dict(type='L1Loss', loss_weight=0.1),
loss_dims=dict(type='L1Loss', loss_weight=0.1),
loss_offsets2d=dict(type='L1Loss', loss_weight=0.1),
loss_direct_depth=dict(type='L1Loss', loss_weight=0.1),
loss_keypoints_depth=dict(type='L1Loss', loss_weight=0.1),
loss_combined_depth=dict(type='L1Loss', loss_weight=0.1),
loss_attr=None,
bbox_coder=dict(type='MonoFlexCoder', code_size=7),
norm_cfg=dict(type='BN'),
init_cfg=None,
init_bias=-2.19,
**kwargs):
self.use_edge_fusion = use_edge_fusion
self.edge_fusion_inds = edge_fusion_inds
super().__init__(
num_classes,
in_channels,
loss_cls=loss_cls,
loss_bbox=loss_bbox,
loss_dir=loss_dir,
loss_attr=loss_attr,
norm_cfg=norm_cfg,
init_cfg=init_cfg,
**kwargs)
self.filter_outside_objs = filter_outside_objs
self.edge_heatmap_ratio = edge_heatmap_ratio
self.init_bias = init_bias
self.loss_dir = build_loss(loss_dir)
self.loss_keypoints = build_loss(loss_keypoints)
self.loss_dims = build_loss(loss_dims)
self.loss_offsets2d = build_loss(loss_offsets2d)
self.loss_direct_depth = build_loss(loss_direct_depth)
self.loss_keypoints_depth = build_loss(loss_keypoints_depth)
self.loss_combined_depth = build_loss(loss_combined_depth)
self.bbox_coder = build_bbox_coder(bbox_coder)
def _init_edge_module(self):
"""Initialize edge fusion module for feature extraction."""
self.edge_fuse_cls = EdgeFusionModule(self.num_classes, 256)
for i in range(len(self.edge_fusion_inds)):
reg_inds, out_inds = self.edge_fusion_inds[i]
out_channels = self.group_reg_dims[reg_inds][out_inds]
fusion_layer = EdgeFusionModule(out_channels, 256)
layer_name = f'edge_fuse_reg_{reg_inds}_{out_inds}'
self.add_module(layer_name, fusion_layer)
def init_weights(self):
"""Initialize weights."""
super().init_weights()
self.conv_cls.bias.data.fill_(self.init_bias)
xavier_init(self.conv_regs[4][0], gain=0.01)
xavier_init(self.conv_regs[7][0], gain=0.01)
for m in self.conv_regs.modules():
if isinstance(m, nn.Conv2d):
if m.bias is not None:
nn.init.constant_(m.bias, 0)
def _init_predictor(self):
"""Initialize predictor layers of the head."""
self.conv_cls_prev = self._init_branch(
conv_channels=self.cls_branch,
conv_strides=(1, ) * len(self.cls_branch))
self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels,
1)
# init regression head
self.conv_reg_prevs = nn.ModuleList()
# init output head
self.conv_regs = nn.ModuleList()
# group_reg_dims:
# ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, ))
for i in range(len(self.group_reg_dims)):
reg_dims = self.group_reg_dims[i]
reg_branch_channels = self.reg_branch[i]
out_channel = self.out_channels[i]
reg_list = nn.ModuleList()
if len(reg_branch_channels) > 0:
self.conv_reg_prevs.append(
self._init_branch(
conv_channels=reg_branch_channels,
conv_strides=(1, ) * len(reg_branch_channels)))
for reg_dim in reg_dims:
reg_list.append(nn.Conv2d(out_channel, reg_dim, 1))
self.conv_regs.append(reg_list)
else:
self.conv_reg_prevs.append(None)
for reg_dim in reg_dims:
reg_list.append(nn.Conv2d(self.feat_channels, reg_dim, 1))
self.conv_regs.append(reg_list)
def _init_layers(self):
"""Initialize layers of the head."""
self._init_predictor()
if self.use_edge_fusion:
self._init_edge_module()
def forward_train(self, x, input_metas, gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels,
gt_bboxes_ignore, proposal_cfg, **kwargs):
"""
Args:
x (list[Tensor]): Features from FPN.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes (list[Tensor]): Ground truth bboxes of the image,
shape (num_gts, 4).
gt_labels (list[Tensor]): Ground truth labels of each box,
shape (num_gts,).
gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image,
shape (num_gts, self.bbox_code_size).
gt_labels_3d (list[Tensor]): 3D ground truth labels of each box,
shape (num_gts,).
centers2d (list[Tensor]): Projected 3D center of each box,
shape (num_gts, 2).
depths (list[Tensor]): Depth of projected 3D center of each box,
shape (num_gts,).
attr_labels (list[Tensor]): Attribute labels of each box,
shape (num_gts,).
gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
proposal_cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used
Returns:
tuple:
losses: (dict[str, Tensor]): A dictionary of loss components.
proposal_list (list[Tensor]): Proposals of each image.
"""
outs = self(x, input_metas)
if gt_labels is None:
loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths,
attr_labels, input_metas)
else:
loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels,
input_metas)
losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
if proposal_cfg is None:
return losses
else:
proposal_list = self.get_bboxes(
*outs, input_metas, cfg=proposal_cfg)
return losses, proposal_list
def forward(self, feats, input_metas):
"""Forward features from the upstream network.
Args:
feats (list[Tensor]): Features from the upstream network, each is
a 4D-tensor.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
tuple:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
"""
mlvl_input_metas = [input_metas for i in range(len(feats))]
return multi_apply(self.forward_single, feats, mlvl_input_metas)
def forward_single(self, x, input_metas):
"""Forward features of a single scale level.
Args:
x (Tensor): Feature maps from a specific FPN feature level.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
tuple: Scores for each class, bbox predictions.
"""
img_h, img_w = input_metas[0]['pad_shape'][:2]
batch_size, _, feat_h, feat_w = x.shape
downsample_ratio = img_h / feat_h
for conv_cls_prev_layer in self.conv_cls_prev:
cls_feat = conv_cls_prev_layer(x)
out_cls = self.conv_cls(cls_feat)
if self.use_edge_fusion:
# calculate the edge indices for the batch data
edge_indices_list = get_edge_indices(
input_metas, downsample_ratio, device=x.device)
edge_lens = [
edge_indices.shape[0] for edge_indices in edge_indices_list
]
max_edge_len = max(edge_lens)
edge_indices = x.new_zeros((batch_size, max_edge_len, 2),
dtype=torch.long)
for i in range(batch_size):
edge_indices[i, :edge_lens[i]] = edge_indices_list[i]
# cls feature map edge fusion
out_cls = self.edge_fuse_cls(cls_feat, out_cls, edge_indices,
edge_lens, feat_h, feat_w)
bbox_pred = []
for i in range(len(self.group_reg_dims)):
reg_feat = x.clone()
# feature regression head
if len(self.reg_branch[i]) > 0:
for conv_reg_prev_layer in self.conv_reg_prevs[i]:
reg_feat = conv_reg_prev_layer(reg_feat)
for j, conv_reg in enumerate(self.conv_regs[i]):
out_reg = conv_reg(reg_feat)
# Use Edge Fusion Module
if self.use_edge_fusion and (i, j) in self.edge_fusion_inds:
# reg feature map edge fusion
out_reg = getattr(self, 'edge_fuse_reg_{}_{}'.format(
i, j))(reg_feat, out_reg, edge_indices, edge_lens,
feat_h, feat_w)
bbox_pred.append(out_reg)
bbox_pred = torch.cat(bbox_pred, dim=1)
cls_score = out_cls.sigmoid() # turn to 0-1
cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4)
return cls_score, bbox_pred
def get_bboxes(self, cls_scores, bbox_preds, input_metas):
"""Generate bboxes from bbox head predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level.
bbox_preds (list[Tensor]): Box regression for each scale.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
rescale (bool): If True, return boxes in original image space.
Returns:
list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]:
Each item in result_list is 4-tuple.
"""
assert len(cls_scores) == len(bbox_preds) == 1
cam2imgs = torch.stack([
cls_scores[0].new_tensor(input_meta['cam2img'])
for input_meta in input_metas
])
batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap(
cls_scores[0],
bbox_preds[0],
input_metas,
cam2imgs=cam2imgs,
topk=100,
kernel=3)
result_list = []
for img_id in range(len(input_metas)):
bboxes = batch_bboxes[img_id]
scores = batch_scores[img_id]
labels = batch_topk_labels[img_id]
keep_idx = scores > 0.25
bboxes = bboxes[keep_idx]
scores = scores[keep_idx]
labels = labels[keep_idx]
bboxes = input_metas[img_id]['box_type_3d'](
bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5))
attrs = None
result_list.append((bboxes, scores, labels, attrs))
return result_list
def decode_heatmap(self,
cls_score,
reg_pred,
input_metas,
cam2imgs,
topk=100,
kernel=3):
"""Transform outputs into detections raw bbox predictions.
Args:
            cls_score (Tensor): Center predicted heatmap,
shape (B, num_classes, H, W).
reg_pred (Tensor): Box regression map.
shape (B, channel, H , W).
input_metas (List[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cam2imgs (Tensor): Camera intrinsic matrix.
shape (N, 4, 4)
topk (int, optional): Get top k center keypoints from heatmap.
Default 100.
kernel (int, optional): Max pooling kernel for extract local
maximum pixels. Default 3.
Returns:
            tuple[torch.Tensor]: Decoded output of MonoFlexHead, containing
the following Tensors:
- batch_bboxes (Tensor): Coords of each 3D box.
shape (B, k, 7)
- batch_scores (Tensor): Scores of each 3D box.
shape (B, k)
- batch_topk_labels (Tensor): Categories of each 3D box.
shape (B, k)
"""
img_h, img_w = input_metas[0]['pad_shape'][:2]
batch_size, _, feat_h, feat_w = cls_score.shape
downsample_ratio = img_h / feat_h
center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel)
*batch_dets, topk_ys, topk_xs = get_topk_from_heatmap(
center_heatmap_pred, k=topk)
batch_scores, batch_index, batch_topk_labels = batch_dets
regression = transpose_and_gather_feat(reg_pred, batch_index)
        regression = regression.view(-1, reg_pred.shape[1])
pred_base_centers2d = torch.cat(
[topk_xs.view(-1, 1),
topk_ys.view(-1, 1).float()], dim=1)
preds = self.bbox_coder.decode(regression, batch_topk_labels,
downsample_ratio, cam2imgs)
pred_locations = self.bbox_coder.decode_location(
pred_base_centers2d, preds['offsets2d'], preds['combined_depth'],
cam2imgs, downsample_ratio)
pred_yaws = self.bbox_coder.decode_orientation(
preds['orientations']).unsqueeze(-1)
pred_dims = preds['dimensions']
batch_bboxes = torch.cat((pred_locations, pred_dims, pred_yaws), dim=1)
batch_bboxes = batch_bboxes.view(batch_size, -1, self.bbox_code_size)
return batch_bboxes, batch_scores, batch_topk_labels
def get_predictions(self, pred_reg, labels3d, centers2d, reg_mask,
batch_indices, input_metas, downsample_ratio):
"""Prepare predictions for computing loss.
Args:
pred_reg (Tensor): Box regression map.
shape (B, channel, H , W).
labels3d (Tensor): Labels of each 3D box.
shape (B * max_objs, )
centers2d (Tensor): Coords of each projected 3D box
center on image. shape (N, 2)
reg_mask (Tensor): Indexes of the existence of the 3D box.
shape (B * max_objs, )
            batch_indices (Tensor): Batch indices of the 3D box.
                shape (N, )
input_metas (list[dict]): Meta information of each image,
e.g., image size, scaling factor, etc.
downsample_ratio (int): The stride of feature map.
Returns:
dict: The predictions for computing loss.
"""
batch, channel = pred_reg.shape[0], pred_reg.shape[1]
w = pred_reg.shape[3]
cam2imgs = torch.stack([
centers2d.new_tensor(input_meta['cam2img'])
for input_meta in input_metas
])
# (batch_size, 4, 4) -> (N, 4, 4)
cam2imgs = cam2imgs[batch_indices, :, :]
centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0]
centers2d_inds = centers2d_inds.view(batch, -1)
pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds)
pred_regression_pois = pred_regression.view(-1, channel)[reg_mask]
preds = self.bbox_coder.decode(pred_regression_pois, labels3d,
downsample_ratio, cam2imgs)
return preds
def get_targets(self, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list,
gt_labels_3d_list, centers2d_list, depths_list, feat_shape,
img_shape, input_metas):
"""Get training targets for batch images.
Args:
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each
image, shape (num_gt, 4).
gt_labels_list (list[Tensor]): Ground truth labels of each
box, shape (num_gt,).
gt_bboxes_3d_list (list[:obj:`CameraInstance3DBoxes`]): 3D
Ground truth bboxes of each image,
shape (num_gt, bbox_code_size).
gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of
each box, shape (num_gt,).
centers2d_list (list[Tensor]): Projected 3D centers onto 2D
image, shape (num_gt, 2).
depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
image, each has shape (num_gt, 1).
feat_shape (tuple[int]): Feature map shape with value,
shape (B, _, H, W).
img_shape (tuple[int]): Image shape in [h, w] format.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
tuple[Tensor, dict]: The Tensor value is the targets of
center heatmap, the dict has components below:
- base_centers2d_target (Tensor): Coords of each projected 3D box
center on image. shape (B * max_objs, 2), [dtype: int]
- labels3d (Tensor): Labels of each 3D box.
shape (N, )
- reg_mask (Tensor): Mask of the existence of the 3D box.
shape (B * max_objs, )
- batch_indices (Tensor): Batch id of the 3D box.
shape (N, )
- depth_target (Tensor): Depth target of each 3D box.
shape (N, )
- keypoints2d_target (Tensor): Keypoints of each projected 3D box
on image. shape (N, 10, 2)
- keypoints_mask (Tensor): Keypoints mask of each projected 3D
box on image. shape (N, 10)
- keypoints_depth_mask (Tensor): Depths decoded from keypoints
of each 3D box. shape (N, 3)
- orientations_target (Tensor): Orientation (encoded local yaw)
target of each 3D box. shape (N, )
- offsets2d_target (Tensor): Offsets target of each projected
3D box. shape (N, 2)
- dimensions_target (Tensor): Dimensions target of each 3D box.
shape (N, 3)
- downsample_ratio (int): The stride of feature map.
"""
img_h, img_w = img_shape[:2]
batch_size, _, feat_h, feat_w = feat_shape
width_ratio = float(feat_w / img_w) # 1/4
height_ratio = float(feat_h / img_h) # 1/4
assert width_ratio == height_ratio
# Whether to filter the objects which are not in FOV.
if self.filter_outside_objs:
filter_outside_objs(gt_bboxes_list, gt_labels_list,
gt_bboxes_3d_list, gt_labels_3d_list,
centers2d_list, input_metas)
# transform centers2d to base centers2d for regression and
# heatmap generation.
# centers2d = int(base_centers2d) + offsets2d
base_centers2d_list, offsets2d_list, trunc_mask_list = \
handle_proj_objs(centers2d_list, gt_bboxes_list, input_metas)
keypoints2d_list, keypoints_mask_list, keypoints_depth_mask_list = \
get_keypoints(gt_bboxes_3d_list, centers2d_list, input_metas)
center_heatmap_target = gt_bboxes_list[-1].new_zeros(
[batch_size, self.num_classes, feat_h, feat_w])
for batch_id in range(batch_size):
# project gt_bboxes from input image to feat map
gt_bboxes = gt_bboxes_list[batch_id] * width_ratio
gt_labels = gt_labels_list[batch_id]
# project base centers2d from input image to feat map
gt_base_centers2d = base_centers2d_list[batch_id] * width_ratio
trunc_masks = trunc_mask_list[batch_id]
for j, base_center2d in enumerate(gt_base_centers2d):
if trunc_masks[j]:
# for outside objects, generate ellipse heatmap
base_center2d_x_int, base_center2d_y_int = \
base_center2d.int()
scale_box_w = min(base_center2d_x_int - gt_bboxes[j][0],
gt_bboxes[j][2] - base_center2d_x_int)
scale_box_h = min(base_center2d_y_int - gt_bboxes[j][1],
gt_bboxes[j][3] - base_center2d_y_int)
radius_x = scale_box_w * self.edge_heatmap_ratio
radius_y = scale_box_h * self.edge_heatmap_ratio
radius_x, radius_y = max(0, int(radius_x)), max(
0, int(radius_y))
assert min(radius_x, radius_y) == 0
ind = gt_labels[j]
get_ellip_gaussian_2D(
center_heatmap_target[batch_id, ind],
[base_center2d_x_int, base_center2d_y_int], radius_x,
radius_y)
else:
base_center2d_x_int, base_center2d_y_int = \
base_center2d.int()
scale_box_h = (gt_bboxes[j][3] - gt_bboxes[j][1])
scale_box_w = (gt_bboxes[j][2] - gt_bboxes[j][0])
radius = gaussian_radius([scale_box_h, scale_box_w],
min_overlap=0.7)
radius = max(0, int(radius))
ind = gt_labels[j]
gen_gaussian_target(
center_heatmap_target[batch_id, ind],
[base_center2d_x_int, base_center2d_y_int], radius)
avg_factor = max(1, center_heatmap_target.eq(1).sum())
num_ctrs = [centers2d.shape[0] for centers2d in centers2d_list]
max_objs = max(num_ctrs)
batch_indices = [
centers2d_list[0].new_full((num_ctrs[i], ), i)
for i in range(batch_size)
]
batch_indices = torch.cat(batch_indices, dim=0)
reg_mask = torch.zeros(
(batch_size, max_objs),
dtype=torch.bool).to(base_centers2d_list[0].device)
        gt_bboxes_3d = input_metas[0]['box_type_3d'].cat(gt_bboxes_3d_list)
gt_bboxes_3d = gt_bboxes_3d.to(base_centers2d_list[0].device)
# encode original local yaw to multibin format
        orientations_target = self.bbox_coder.encode(gt_bboxes_3d)
batch_base_centers2d = base_centers2d_list[0].new_zeros(
(batch_size, max_objs, 2))
for i in range(batch_size):
reg_mask[i, :num_ctrs[i]] = 1
batch_base_centers2d[i, :num_ctrs[i]] = base_centers2d_list[i]
flatten_reg_mask = reg_mask.flatten()
# transform base centers2d from input scale to output scale
batch_base_centers2d = batch_base_centers2d.view(-1, 2) * width_ratio
dimensions_target = gt_bboxes_3d.tensor[:, 3:6]
labels_3d = torch.cat(gt_labels_3d_list)
keypoints2d_target = torch.cat(keypoints2d_list)
keypoints_mask = torch.cat(keypoints_mask_list)
keypoints_depth_mask = torch.cat(keypoints_depth_mask_list)
offsets2d_target = torch.cat(offsets2d_list)
bboxes2d = torch.cat(gt_bboxes_list)
# transform FCOS style bbox into [x1, y1, x2, y2] format.
bboxes2d_target = torch.cat([bboxes2d[:, 0:2] * -1, bboxes2d[:, 2:]],
dim=-1)
depths = torch.cat(depths_list)
target_labels = dict(
base_centers2d_target=batch_base_centers2d.int(),
labels3d=labels_3d,
reg_mask=flatten_reg_mask,
batch_indices=batch_indices,
bboxes2d_target=bboxes2d_target,
depth_target=depths,
keypoints2d_target=keypoints2d_target,
keypoints_mask=keypoints_mask,
keypoints_depth_mask=keypoints_depth_mask,
            orientations_target=orientations_target,
offsets2d_target=offsets2d_target,
dimensions_target=dimensions_target,
downsample_ratio=1 / width_ratio)
return center_heatmap_target, avg_factor, target_labels
def loss(self,
cls_scores,
bbox_preds,
gt_bboxes,
gt_labels,
gt_bboxes_3d,
gt_labels_3d,
centers2d,
depths,
attr_labels,
input_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level.
shape (num_gt, 4).
bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel
number is bbox_code_size.
shape (B, 7, H, W).
gt_bboxes (list[Tensor]): Ground truth bboxes for each image.
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): Class indices corresponding to each box.
shape (num_gts, ).
gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D boxes ground
truth. it is the flipped gt_bboxes
gt_labels_3d (list[Tensor]): Same as gt_labels.
centers2d (list[Tensor]): 2D centers on the image.
shape (num_gts, 2).
depths (list[Tensor]): Depth ground truth.
shape (num_gts, ).
attr_labels (list[Tensor]): Attributes indices of each box.
In kitti it's None.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
boxes can be ignored when computing the loss.
Default: None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert len(cls_scores) == len(bbox_preds) == 1
assert attr_labels is None
assert gt_bboxes_ignore is None
center2d_heatmap = cls_scores[0]
pred_reg = bbox_preds[0]
center2d_heatmap_target, avg_factor, target_labels = \
self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths,
center2d_heatmap.shape,
input_metas[0]['pad_shape'],
input_metas)
preds = self.get_predictions(
pred_reg=pred_reg,
labels3d=target_labels['labels3d'],
centers2d=target_labels['base_centers2d_target'],
reg_mask=target_labels['reg_mask'],
batch_indices=target_labels['batch_indices'],
input_metas=input_metas,
downsample_ratio=target_labels['downsample_ratio'])
# heatmap loss
loss_cls = self.loss_cls(
center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor)
# bbox2d regression loss
loss_bbox = self.loss_bbox(preds['bboxes2d'],
target_labels['bboxes2d_target'])
# keypoints loss, the keypoints in predictions and target are all
# local coordinates. Check the mask dtype should be bool, not int
# or float to ensure the indexing is bool index
        keypoints2d_mask = target_labels['keypoints_mask']
loss_keypoints = self.loss_keypoints(
preds['keypoints2d'][keypoints2d_mask],
target_labels['keypoints2d_target'][keypoints2d_mask])
# orientations loss
loss_dir = self.loss_dir(preds['orientations'],
target_labels['orientations_target'])
# dimensions loss
loss_dims = self.loss_dims(preds['dimensions'],
target_labels['dimensions_target'])
# offsets for center heatmap
loss_offsets2d = self.loss_offsets2d(preds['offsets2d'],
target_labels['offsets2d_target'])
# directly regressed depth loss with direct depth uncertainty loss
direct_depth_weights = torch.exp(-preds['direct_depth_uncertainty'])
loss_weight_1 = self.loss_direct_depth.loss_weight
loss_direct_depth = self.loss_direct_depth(
preds['direct_depth'], target_labels['depth_target'],
direct_depth_weights)
loss_uncertainty_1 =\
preds['direct_depth_uncertainty'] * loss_weight_1
loss_direct_depth = loss_direct_depth + loss_uncertainty_1.mean()
# keypoints decoded depth loss with keypoints depth uncertainty loss
depth_mask = target_labels['keypoints_depth_mask']
depth_target = target_labels['depth_target'].unsqueeze(-1).repeat(1, 3)
valid_keypoints_depth_uncertainty = preds[
'keypoints_depth_uncertainty'][depth_mask]
valid_keypoints_depth_weights = torch.exp(
-valid_keypoints_depth_uncertainty)
        loss_keypoints_depth = self.loss_keypoints_depth(
preds['keypoints_depth'][depth_mask], depth_target[depth_mask],
valid_keypoints_depth_weights)
loss_weight_2 = self.loss_keypoints_depth.loss_weight
loss_uncertainty_2 =\
valid_keypoints_depth_uncertainty * loss_weight_2
loss_keypoints_depth = loss_keypoints_depth + loss_uncertainty_2.mean()
        # combined depth loss for optimizing the uncertainty
loss_combined_depth = self.loss_combined_depth(
preds['combined_depth'], target_labels['depth_target'])
loss_dict = dict(
loss_cls=loss_cls,
loss_bbox=loss_bbox,
loss_keypoints=loss_keypoints,
loss_dir=loss_dir,
loss_dims=loss_dims,
loss_offsets2d=loss_offsets2d,
loss_direct_depth=loss_direct_depth,
loss_keypoints_depth=loss_keypoints_depth,
loss_combined_depth=loss_combined_depth)
return loss_dict
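# (Illustrative sketch, not part of this commit's diff: the uncertainty-aware
#  depth term assembled in the loss above. The L1 error is weighted by
#  exp(-sigma) and a penalty proportional to sigma is added, so the network
#  can down-weight hard samples but pays for large predicted uncertainties.
#  Numbers are made up and the averaging is simplified compared with mmdet's
#  L1Loss.)
import torch

pred_depth = torch.tensor([10.0, 30.0])
gt_depth = torch.tensor([12.0, 29.0])
log_sigma = torch.tensor([0.5, -1.0])  # 'direct_depth_uncertainty'
loss_weight = 0.1  # matches the L1Loss default above

weights = torch.exp(-log_sigma)  # passed as the per-sample loss weights
loss_depth = ((pred_depth - gt_depth).abs() * weights).mean() * loss_weight
loss_depth = loss_depth + (log_sigma * loss_weight).mean()
print(loss_depth)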
# Copyright (c) OpenMMLab. All rights reserved.
+from .edge_fusion_module import EdgeFusionModule
from .transformer import GroupFree3DMHA
from .vote_module import VoteModule

-__all__ = ['VoteModule', 'GroupFree3DMHA']
+__all__ = ['VoteModule', 'GroupFree3DMHA', 'EdgeFusionModule']
from mmcv.cnn import ConvModule
from mmcv.runner import BaseModule
from torch import nn as nn
from torch.nn import functional as F
class EdgeFusionModule(BaseModule):
"""Edge Fusion Module for feature map.
Args:
out_channels (int): The number of output channels.
feat_channels (int): The number of channels in feature map
during edge feature fusion.
kernel_size (int, optional): Kernel size of convolution.
Default: 3.
act_cfg (dict, optional): Config of activation.
Default: dict(type='ReLU').
norm_cfg (dict, optional): Config of normalization.
Default: dict(type='BN1d')).
"""
def __init__(self,
out_channels,
feat_channels,
kernel_size=3,
act_cfg=dict(type='ReLU'),
norm_cfg=dict(type='BN1d')):
super().__init__()
self.edge_convs = nn.Sequential(
ConvModule(
feat_channels,
feat_channels,
kernel_size=kernel_size,
padding=kernel_size // 2,
conv_cfg=dict(type='Conv1d'),
norm_cfg=norm_cfg,
act_cfg=act_cfg),
nn.Conv1d(feat_channels, out_channels, kernel_size=1))
self.feat_channels = feat_channels
def forward(self, features, fused_features, edge_indices, edge_lens,
output_h, output_w):
"""Forward pass.
Args:
features (torch.Tensor): Different representative features
for fusion.
fused_features (torch.Tensor): Different representative
features to be fused.
edge_indices (torch.Tensor): Batch image edge indices.
edge_lens (list[int]): List of edge length of each image.
output_h (int): Height of output feature map.
output_w (int): Width of output feature map.
Returns:
torch.Tensor: Fused feature maps.
"""
batch_size = features.shape[0]
# normalize
grid_edge_indices = edge_indices.view(batch_size, -1, 1, 2).float()
grid_edge_indices[..., 0] = \
grid_edge_indices[..., 0] / (output_w - 1) * 2 - 1
grid_edge_indices[..., 1] = \
grid_edge_indices[..., 1] / (output_h - 1) * 2 - 1
# apply edge fusion
edge_features = F.grid_sample(
features, grid_edge_indices, align_corners=True).squeeze(-1)
edge_output = self.edge_convs(edge_features)
for k in range(batch_size):
edge_indice_k = edge_indices[k, :edge_lens[k]]
fused_features[k, :, edge_indice_k[:, 1],
edge_indice_k[:, 0]] += edge_output[
k, :, :edge_lens[k]]
return fused_features
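# (Illustrative sketch, not part of this commit's diff: exercising
#  EdgeFusionModule standalone. Edge pixels of the feature map are sampled
#  with grid_sample, pushed through the 1D conv branch, and added back onto
#  the fused map at the same locations. Shapes mirror the head above; the
#  concrete numbers and the two edge pixels per image are made up.)
import torch
from mmdet3d.models.model_utils import EdgeFusionModule

batch, feat_channels, out_channels, h, w = 2, 64, 3, 32, 32
features = torch.rand(batch, feat_channels, h, w)
fused = torch.rand(batch, out_channels, h, w)  # e.g. the cls logits
edge_indices = torch.tensor([[[0, 0], [1, 0]],
                             [[0, 0], [0, 1]]], dtype=torch.long)  # (B, K, 2) as (x, y)
edge_lens = [2, 2]

module = EdgeFusionModule(out_channels, feat_channels)
out = module(features, fused, edge_indices, edge_lens, h, w)
print(out.shape)  # torch.Size([2, 3, 32, 32])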
@@ -4,6 +4,7 @@ import torch

def get_edge_indices(img_metas,
+                     downsample_ratio,
                     step=1,
                     pad_mode='default',
                     dtype=np.float32,
@@ -17,6 +18,7 @@ def get_edge_indices(img_metas,
    Args:
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
+        downsample_ratio (int): Downsample ratio of output feature,
        step (int, optional): Step size used for generateing
            edge indices. Default: 1.
        pad_mode (str, optional): Padding mode during data pipeline.
@@ -32,13 +34,21 @@ def get_edge_indices(img_metas,
    edge_indices_list = []
    for i in range(len(img_metas)):
        img_shape = img_metas[i]['img_shape']
+        pad_shape = img_metas[i]['pad_shape']
        h, w = img_shape[:2]
+        pad_h, pad_w = pad_shape
        edge_indices = []
        if pad_mode == 'default':
            x_min = 0
            y_min = 0
-            x_max, y_max = w - 1, h - 1
+            x_max = (w - 1) // downsample_ratio
+            y_max = (h - 1) // downsample_ratio
+        elif pad_mode == 'center':
+            x_min = np.ceil((pad_w - w) / 2 * downsample_ratio)
+            y_min = np.ceil((pad_h - h) / 2 * downsample_ratio)
+            x_max = x_min + w // downsample_ratio
+            y_max = y_min + h // downsample_ratio
        else:
            raise NotImplementedError
    ...
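# (Illustrative check, not part of this commit's diff: with pad_mode='default'
#  the new downsample_ratio handling traces the feature-map rectangle from
#  (0, 0) to (x_max, y_max), so the number of edge indices is roughly the
#  perimeter. The two counts reproduce the expectations in the updated unit
#  test below.)
downsample_ratio = 4
for h, w in [(110, 110), (98, 110)]:
    x_max = (w - 1) // downsample_ratio
    y_max = (h - 1) // downsample_ratio
    n_edge = 2 * (x_max + 1) + 2 * (y_max + 1) - 4  # corners counted once
    print(n_edge)  # 108, then 102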
@@ -1505,3 +1505,62 @@ def test_pgd_head():
    assert results[0][2].shape == torch.Size([20])
    assert results[0][3] is None
    assert results[0][4].shape == torch.Size([20, 5])
def test_monoflex_head():
head_cfg = dict(
type='MonoFlexHead',
num_classes=3,
in_channels=64,
use_edge_fusion=True,
edge_fusion_inds=[(1, 0)],
edge_heatmap_ratio=1 / 8,
stacked_convs=0,
feat_channels=64,
use_direction_classifier=False,
diff_rad_by_sin=False,
pred_attrs=False,
pred_velo=False,
dir_offset=0,
strides=None,
group_reg_dims=((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ),
(1, )),
cls_branch=(256, ),
reg_branch=((256, ), (256, ), (256, ), (256, ), (256, ), (256, ),
(256, ), (256, )),
num_attrs=0,
bbox_code_size=7,
dir_branch=(),
attr_branch=(),
bbox_coder=dict(
type='MonoFlexCoder',
depth_mode='exp',
base_depth=(26.494627, 16.05988),
depth_range=[0.1, 100],
combine_depth=True,
uncertainty_range=[-10, 10],
base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367, 0.1022),
(0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427),
(1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)),
dims_mode='linear',
multibin=True,
num_dir_bins=4,
bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2],
bin_margin=np.pi / 6,
code_size=7),
conv_bias=True,
dcn_on_last_conv=False)
self = build_head(head_cfg)
feats = [torch.rand([2, 64, 32, 32], dtype=torch.float32)]
input_metas = [
dict(img_shape=(110, 110), pad_shape=(128, 128)),
dict(img_shape=(98, 110), pad_shape=(128, 128))
]
cls_score, out_reg = self(feats, input_metas)
assert cls_score[0].shape == torch.Size([2, 3, 32, 32])
assert out_reg[0].shape == torch.Size([2, 50, 32, 32])
# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
import torch
from mmcv.cnn import Scale
from torch import nn as nn
@@ -596,3 +597,69 @@ def test_smoke_bbox_coder():
    locations = torch.tensor([[15., 2., 1.], [15., 2., -1.]])
    orientations = bbox_coder._decode_orientation(ori_vector, locations)
    assert orientations.shape == torch.Size([2, 1])
def test_monoflex_bbox_coder():
bbox_coder_cfg = dict(
type='MonoFlexCoder',
depth_mode='exp',
base_depth=(26.494627, 16.05988),
depth_range=[0.1, 100],
combine_depth=True,
uncertainty_range=[-10, 10],
base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367,
0.1022), (0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427),
(1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)),
dims_mode='linear',
multibin=True,
num_dir_bins=4,
bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2],
bin_margin=np.pi / 6,
code_size=7)
bbox_coder = build_bbox_coder(bbox_coder_cfg)
gt_bboxes_3d = CameraInstance3DBoxes(torch.rand([6, 7]))
orientation_target = bbox_coder.encode(gt_bboxes_3d)
assert orientation_target.shape == torch.Size([6, 8])
regression = torch.rand([100, 50])
base_centers2d = torch.rand([100, 2])
labels = torch.ones([100])
downsample_ratio = 4
cam2imgs = torch.rand([100, 4, 4])
preds = bbox_coder.decode(regression, base_centers2d, labels,
downsample_ratio, cam2imgs)
assert preds['bboxes2d'].shape == torch.Size([100, 4])
assert preds['dimensions'].shape == torch.Size([100, 3])
assert preds['offsets2d'].shape == torch.Size([100, 2])
assert preds['keypoints2d'].shape == torch.Size([100, 10, 2])
assert preds['orientations'].shape == torch.Size([100, 16])
assert preds['direct_depth'].shape == torch.Size([
100,
])
assert preds['keypoints_depth'].shape == torch.Size([100, 3])
assert preds['combined_depth'].shape == torch.Size([
100,
])
assert preds['direct_depth_uncertainty'].shape == torch.Size([
100,
])
assert preds['keypoints_depth_uncertainty'].shape == torch.Size([100, 3])
offsets_2d = torch.randn([100, 2])
depths = torch.randn([
100,
])
locations = bbox_coder.decode_location(base_centers2d, offsets_2d, depths,
cam2imgs, downsample_ratio)
assert locations.shape == torch.Size([100, 3])
orientations = torch.randn([100, 16])
yaws, local_yaws = bbox_coder.decode_orientation(orientations, locations)
assert yaws.shape == torch.Size([
100,
])
assert local_yaws.shape == torch.Size([
100,
])
@@ -195,11 +195,15 @@ def test_points_img2cam():

def test_generate_edge_indices():
-    img_metas = [dict(img_shape=[300, 400]), dict(img_shape=[500, 450])]
-    edge_indices_list = get_edge_indices(img_metas)
+    input_metas = [
+        dict(img_shape=(110, 110), pad_shape=(128, 128)),
+        dict(img_shape=(98, 110), pad_shape=(128, 128))
+    ]
+    downsample_ratio = 4
+    edge_indices_list = get_edge_indices(input_metas, downsample_ratio)

-    assert edge_indices_list[0].shape[0] == 1396
-    assert edge_indices_list[1].shape[0] == 1896
+    assert edge_indices_list[0].shape[0] == 108
+    assert edge_indices_list[1].shape[0] == 102

def test_truncation_hanlde():
    ...