Unverified Commit f7356f4b authored by twang, committed by GitHub

[Feature] Support FCOS3D head (#442)

* Support base mono3d dense head and anchor free mono3d head

* Support FCOS3D head

* Support FCOS3D baseline on nuScenes

* Fix an import error caused by update of mmcv/mmdet

* Change img_scale to scale_factor in the MultiScaleFlipAug in the config

* Add pred_bbox2d in the params of anchor_free_mono3d_head

* Add unit test for fcos3d head

* Fix a minor bug when setting img_metas in the unit test

* Add unit test for fcos3d detector

* Simplify the logic of weights initialization

* Add comments to specify the reason of cloning features

* Update head config
parent a0090aa1
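The diff below adds the nuScenes monocular dataset config, the FCOS3D model and experiment configs, the mono3D dense-head implementations (BaseMono3DDenseHead, AnchorFreeMono3DHead, FCOSMono3DHead), and unit tests. As a hedged illustration only (not part of this commit), the resulting detector can be built from the new experiment config through the usual mmcv/mmdet3d registry workflow; the config path below assumes a standard checkout where the new configs live under configs/fcos3d/.

from mmcv import Config
from mmdet3d.models.builder import build_detector

# Hypothetical usage sketch: parse the merged config and instantiate the
# FCOSMono3D detector; train_cfg/test_cfg are nested inside cfg.model here.
cfg = Config.fromfile(
    'configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py')
model = build_detector(cfg.model)

The new dataset config (nus-mono3d) comes first: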
dataset_type = 'NuScenesMonoDataset'
data_root = 'data/nuscenes/'
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='MultiScaleFlipAug',
scale_factor=1.0,
flip=False,
transforms=[
dict(type='RandomFlip3D'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json',
img_prefix=data_root,
classes=class_names,
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
box_type_3d='Camera'),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
img_prefix=data_root,
classes=class_names,
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
box_type_3d='Camera'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
img_prefix=data_root,
classes=class_names,
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
box_type_3d='Camera'))
evaluation = dict(interval=2)
model = dict(
type='FCOSMono3D',
pretrained='open-mmlab://detectron2/resnet101_caffe',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
style='caffe'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
start_level=1,
add_extra_convs=True,
extra_convs_on_inputs=False, # use P5
num_outs=5,
relu_before_extra_convs=True),
bbox_head=dict(
type='FCOSMono3DHead',
num_classes=10,
in_channels=256,
stacked_convs=2,
feat_channels=256,
use_direction_classifier=True,
diff_rad_by_sin=True,
pred_attrs=True,
pred_velo=True,
dir_offset=0.7854, # pi/4
strides=[8, 16, 32, 64, 128],
group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo
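        # Note: these group dims sum to 9, matching bbox_code_size=9 for
        # nuScenes (2D offset to the projected 3D center, depth, 3D size,
        # yaw, 2D velocity); code_weight in train_cfg below must have the
        # same total length.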
cls_branch=(256, ),
reg_branch=(
(256, ), # offset
(256, ), # depth
(256, ), # size
(256, ), # rot
() # velo
),
dir_branch=(256, ),
attr_branch=(256, ),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
norm_on_bbox=True,
centerness_on_reg=True,
center_sampling=True,
conv_bias=True,
dcn_on_last_conv=True),
train_cfg=dict(
allowed_border=0,
code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05],
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=1000,
nms_thr=0.8,
score_thr=0.05,
min_bbox_size=0,
max_per_img=200))
_base_ = [
'../_base_/datasets/nus-mono3d.py', '../_base_/models/fcos3d.py',
'../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True)))
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='MultiScaleFlipAug',
scale_factor=1.0,
flip=False,
transforms=[
dict(type='RandomFlip3D'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
# optimizer
optimizer = dict(
lr=0.002, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
optimizer_config = dict(
_delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
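# Note: _delete_=True makes this optimizer_config fully replace the one
# inherited from the base schedule instead of being merged into it, so only
# the gradient clipping defined here takes effect.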
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
total_epochs = 12
evaluation = dict(interval=2)
from .anchor3d_head import Anchor3DHead
from .anchor_free_mono3d_head import AnchorFreeMono3DHead
from .base_conv_bbox_head import BaseConvBboxHead
from .base_mono3d_dense_head import BaseMono3DDenseHead
from .centerpoint_head import CenterHead
from .fcos_mono3d_head import FCOSMono3DHead
from .free_anchor3d_head import FreeAnchor3DHead
from .parta2_rpn_head import PartA2RPNHead
from .shape_aware_head import ShapeAwareHead
@@ -9,5 +12,6 @@ from .vote_head import VoteHead
__all__ = [
    'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead',
    'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
    'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead'
]
import torch
from abc import abstractmethod
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from mmcv.runner import force_fp32
from torch import nn as nn
from mmdet.core import multi_apply
from mmdet.models.builder import HEADS, build_loss
from .base_mono3d_dense_head import BaseMono3DDenseHead
@HEADS.register_module()
class AnchorFreeMono3DHead(BaseMono3DDenseHead):
"""Anchor-free head for monocular 3D object detection.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
feat_channels (int): Number of hidden channels. Used in child classes.
stacked_convs (int): Number of stacking convs of the head.
strides (tuple): Downsample factor of each feature map.
dcn_on_last_conv (bool): If true, use dcn in the last layer of
towers. Default: False.
conv_bias (bool | str): If specified as `auto`, it will be decided by
the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
None, otherwise False. Default: "auto".
background_label (int | None): Label ID of background, set as 0 for
            RPN and num_classes for other heads. It will automatically be set
            to num_classes if None is given.
use_direction_classifier (bool): Whether to add a direction classifier.
diff_rad_by_sin (bool): Whether to change the difference into sin
difference for box regression loss.
loss_cls (dict): Config of classification loss.
loss_bbox (dict): Config of localization loss.
loss_dir (dict): Config of direction classifier loss.
loss_attr (dict): Config of attribute classifier loss, which is only
active when pred_attrs=True.
bbox_code_size (int): Dimensions of predicted bounding boxes.
pred_attrs (bool): Whether to predict attributes. Default to False.
num_attrs (int): The number of attributes to be predicted. Default: 9.
pred_velo (bool): Whether to predict velocity. Default to False.
pred_bbox2d (bool): Whether to predict 2D boxes. Default to False.
group_reg_dims (tuple[int]): The dimension of each regression target
group. Default: (2, 1, 3, 1, 2).
cls_branch (tuple[int]): Channels for classification branch.
Default: (128, 64).
reg_branch (tuple[tuple]): Channels for regression branch.
Default: (
(128, 64), # offset
(128, 64), # depth
(64, ), # size
(64, ), # rot
() # velo
),
dir_branch (tuple[int]): Channels for direction classification branch.
Default: (64, ).
        attr_branch (tuple[int]): Channels for attribute classification
            branch. Default: (64, ).
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Config dict for normalization layer. Default: None.
train_cfg (dict): Training config of anchor head.
test_cfg (dict): Testing config of anchor head.
""" # noqa: W605
_version = 1
def __init__(
self,
num_classes,
in_channels,
feat_channels=256,
stacked_convs=4,
strides=(4, 8, 16, 32, 64),
dcn_on_last_conv=False,
conv_bias='auto',
background_label=None,
use_direction_classifier=True,
diff_rad_by_sin=True,
dir_offset=0,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
bbox_code_size=9, # For nuscenes
pred_attrs=False,
num_attrs=9, # For nuscenes
pred_velo=False,
pred_bbox2d=False,
group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo,
cls_branch=(128, 64),
reg_branch=(
(128, 64), # offset
(128, 64), # depth
(64, ), # size
(64, ), # rot
() # velo
),
dir_branch=(64, ),
attr_branch=(64, ),
conv_cfg=None,
norm_cfg=None,
train_cfg=None,
test_cfg=None):
super(AnchorFreeMono3DHead, self).__init__()
self.num_classes = num_classes
self.cls_out_channels = num_classes
self.in_channels = in_channels
self.feat_channels = feat_channels
self.stacked_convs = stacked_convs
self.strides = strides
self.dcn_on_last_conv = dcn_on_last_conv
assert conv_bias == 'auto' or isinstance(conv_bias, bool)
self.conv_bias = conv_bias
self.use_direction_classifier = use_direction_classifier
self.diff_rad_by_sin = diff_rad_by_sin
self.dir_offset = dir_offset
self.loss_cls = build_loss(loss_cls)
self.loss_bbox = build_loss(loss_bbox)
self.loss_dir = build_loss(loss_dir)
self.bbox_code_size = bbox_code_size
self.group_reg_dims = list(group_reg_dims)
self.cls_branch = cls_branch
self.reg_branch = reg_branch
assert len(reg_branch) == len(group_reg_dims), 'The number of '\
            'elements in reg_branch and group_reg_dims should be the same.'
self.pred_velo = pred_velo
self.pred_bbox2d = pred_bbox2d
self.out_channels = []
for reg_branch_channels in reg_branch:
if len(reg_branch_channels) > 0:
self.out_channels.append(reg_branch_channels[-1])
else:
self.out_channels.append(-1)
self.dir_branch = dir_branch
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.fp16_enabled = False
self.background_label = (
num_classes if background_label is None else background_label)
# background_label should be either 0 or num_classes
assert (self.background_label == 0
or self.background_label == num_classes)
self.pred_attrs = pred_attrs
self.attr_background_label = -1
self.num_attrs = num_attrs
if self.pred_attrs:
self.attr_background_label = num_attrs
self.loss_attr = build_loss(loss_attr)
self.attr_branch = attr_branch
self._init_layers()
def _init_layers(self):
"""Initialize layers of the head."""
self._init_cls_convs()
self._init_reg_convs()
self._init_predictor()
def _init_cls_convs(self):
"""Initialize classification conv layers of the head."""
self.cls_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
if self.dcn_on_last_conv and i == self.stacked_convs - 1:
conv_cfg = dict(type='DCNv2')
else:
conv_cfg = self.conv_cfg
self.cls_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.conv_bias))
def _init_reg_convs(self):
"""Initialize bbox regression conv layers of the head."""
self.reg_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
if self.dcn_on_last_conv and i == self.stacked_convs - 1:
conv_cfg = dict(type='DCNv2')
else:
conv_cfg = self.conv_cfg
self.reg_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.conv_bias))
def _init_branch(self, conv_channels=(64), conv_strides=(1)):
"""Initialize conv layers as a prediction branch."""
conv_before_pred = nn.ModuleList()
if isinstance(conv_channels, int):
conv_channels = [self.feat_channels] + [conv_channels]
conv_strides = [conv_strides]
else:
conv_channels = [self.feat_channels] + list(conv_channels)
conv_strides = list(conv_strides)
for i in range(len(conv_strides)):
conv_before_pred.append(
ConvModule(
conv_channels[i],
conv_channels[i + 1],
3,
stride=conv_strides[i],
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.conv_bias))
return conv_before_pred
def _init_predictor(self):
"""Initialize predictor layers of the head."""
self.conv_cls_prev = self._init_branch(
conv_channels=self.cls_branch,
conv_strides=(1, ) * len(self.cls_branch))
self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels,
1)
self.conv_reg_prevs = nn.ModuleList()
self.conv_regs = nn.ModuleList()
for i in range(len(self.group_reg_dims)):
reg_dim = self.group_reg_dims[i]
reg_branch_channels = self.reg_branch[i]
out_channel = self.out_channels[i]
if len(reg_branch_channels) > 0:
self.conv_reg_prevs.append(
self._init_branch(
conv_channels=reg_branch_channels,
conv_strides=(1, ) * len(reg_branch_channels)))
self.conv_regs.append(nn.Conv2d(out_channel, reg_dim, 1))
else:
self.conv_reg_prevs.append(None)
self.conv_regs.append(
nn.Conv2d(self.feat_channels, reg_dim, 1))
if self.use_direction_classifier:
self.conv_dir_cls_prev = self._init_branch(
conv_channels=self.dir_branch,
conv_strides=(1, ) * len(self.dir_branch))
self.conv_dir_cls = nn.Conv2d(self.dir_branch[-1], 2, 1)
if self.pred_attrs:
self.conv_attr_prev = self._init_branch(
conv_channels=self.attr_branch,
conv_strides=(1, ) * len(self.attr_branch))
self.conv_attr = nn.Conv2d(self.attr_branch[-1], self.num_attrs, 1)
def init_weights(self):
"""Initialize weights of the head."""
for modules in [self.cls_convs, self.reg_convs, self.conv_cls_prev]:
for m in modules:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
for conv_reg_prev in self.conv_reg_prevs:
if conv_reg_prev is None:
continue
for m in conv_reg_prev:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
if self.use_direction_classifier:
for m in self.conv_dir_cls_prev:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
if self.pred_attrs:
for m in self.conv_attr_prev:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.conv_cls, std=0.01, bias=bias_cls)
for conv_reg in self.conv_regs:
normal_init(conv_reg, std=0.01)
if self.use_direction_classifier:
normal_init(self.conv_dir_cls, std=0.01, bias=bias_cls)
if self.pred_attrs:
normal_init(self.conv_attr, std=0.01, bias=bias_cls)
def forward(self, feats):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
tuple: Usually contain classification scores, bbox predictions, \
and direction class predictions.
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Attribute scores for each scale
level, each is a 4D-tensor, the channel number is
num_points * num_attrs.
"""
return multi_apply(self.forward_single, feats)[:5]
def forward_single(self, x):
"""Forward features of a single scale levle.
Args:
x (Tensor): FPN feature maps of the specified stride.
Returns:
tuple: Scores for each class, bbox predictions, direction class,
and attributes, features after classification and regression
                conv layers. Some models, such as FCOS, need these features.
"""
cls_feat = x
reg_feat = x
for cls_layer in self.cls_convs:
cls_feat = cls_layer(cls_feat)
# clone the cls_feat for reusing the feature map afterwards
clone_cls_feat = cls_feat.clone()
for conv_cls_prev_layer in self.conv_cls_prev:
clone_cls_feat = conv_cls_prev_layer(clone_cls_feat)
cls_score = self.conv_cls(clone_cls_feat)
for reg_layer in self.reg_convs:
reg_feat = reg_layer(reg_feat)
bbox_pred = []
for i in range(len(self.group_reg_dims)):
# clone the reg_feat for reusing the feature map afterwards
clone_reg_feat = reg_feat.clone()
if len(self.reg_branch[i]) > 0:
for conv_reg_prev_layer in self.conv_reg_prevs[i]:
clone_reg_feat = conv_reg_prev_layer(clone_reg_feat)
bbox_pred.append(self.conv_regs[i](clone_reg_feat))
bbox_pred = torch.cat(bbox_pred, dim=1)
dir_cls_pred = None
if self.use_direction_classifier:
clone_reg_feat = reg_feat.clone()
for conv_dir_cls_prev_layer in self.conv_dir_cls_prev:
clone_reg_feat = conv_dir_cls_prev_layer(clone_reg_feat)
dir_cls_pred = self.conv_dir_cls(clone_reg_feat)
attr_pred = None
if self.pred_attrs:
# clone the cls_feat for reusing the feature map afterwards
clone_cls_feat = cls_feat.clone()
for conv_attr_prev_layer in self.conv_attr_prev:
clone_cls_feat = conv_attr_prev_layer(clone_cls_feat)
attr_pred = self.conv_attr(clone_cls_feat)
return cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, \
reg_feat
@abstractmethod
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def loss(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
gt_bboxes,
gt_labels,
gt_bboxes_3d,
gt_labels_3d,
centers2d,
depths,
attr_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
            attr_preds (list[Tensor]): Attribute scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_attrs.
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
gt_bboxes_3d (list[Tensor]): 3D Ground truth bboxes for each
image with shape (num_gts, bbox_code_size).
gt_labels_3d (list[Tensor]): 3D class indices of each box.
centers2d (list[Tensor]): Projected 3D centers onto 2D images.
depths (list[Tensor]): Depth of projected centers on 2D images.
attr_labels (list[Tensor], optional): Attribute indices
corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
"""
raise NotImplementedError
@abstractmethod
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def get_bboxes(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
img_metas,
cfg=None,
rescale=None):
"""Transform network output for a batch into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_points * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_points * bbox_code_size, H, W)
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Attribute scores for each scale level
Has shape (N, num_points * num_attrs, H, W)
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used
rescale (bool): If True, return boxes in original image space
"""
raise NotImplementedError
@abstractmethod
def get_targets(self, points, gt_bboxes_list, gt_labels_list,
gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list,
depths_list, attr_labels_list):
"""Compute regression, classification and centerss targets for points
in multiple images.
Args:
points (list[Tensor]): Points of each fpn level, each has shape
(num_points, 2).
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
each has shape (num_gt, 4).
gt_labels_list (list[Tensor]): Ground truth labels of each box,
each has shape (num_gt,).
gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each
image, each has shape (num_gt, bbox_code_size).
gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each
box, each has shape (num_gt,).
centers2d_list (list[Tensor]): Projected 3D centers onto 2D image,
each has shape (num_gt, 2).
depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
image, each has shape (num_gt, 1).
attr_labels_list (list[Tensor]): Attribute labels of each box,
each has shape (num_gt,).
"""
raise NotImplementedError
def _get_points_single(self,
featmap_size,
stride,
dtype,
device,
flatten=False):
"""Get points of a single scale level."""
h, w = featmap_size
x_range = torch.arange(w, dtype=dtype, device=device)
y_range = torch.arange(h, dtype=dtype, device=device)
y, x = torch.meshgrid(y_range, x_range)
if flatten:
y = y.flatten()
x = x.flatten()
return y, x
def get_points(self, featmap_sizes, dtype, device, flatten=False):
"""Get points according to feature map sizes.
Args:
featmap_sizes (list[tuple]): Multi-level feature map sizes.
dtype (torch.dtype): Type of points.
device (torch.device): Device of points.
Returns:
tuple: points of each image.
"""
mlvl_points = []
for i in range(len(featmap_sizes)):
mlvl_points.append(
self._get_points_single(featmap_sizes[i], self.strides[i],
dtype, device, flatten))
return mlvl_points
from abc import ABCMeta, abstractmethod
from torch import nn as nn
class BaseMono3DDenseHead(nn.Module, metaclass=ABCMeta):
"""Base class for Monocular 3D DenseHeads."""
def __init__(self):
super(BaseMono3DDenseHead, self).__init__()
@abstractmethod
def loss(self, **kwargs):
"""Compute losses of the head."""
pass
@abstractmethod
def get_bboxes(self, **kwargs):
"""Transform network output for a batch into bbox predictions."""
pass
def forward_train(self,
x,
img_metas,
gt_bboxes,
gt_labels=None,
gt_bboxes_3d=None,
gt_labels_3d=None,
centers2d=None,
depths=None,
attr_labels=None,
gt_bboxes_ignore=None,
proposal_cfg=None,
**kwargs):
"""
Args:
x (list[Tensor]): Features from FPN.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes (list[Tensor]): Ground truth bboxes of the image,
shape (num_gts, 4).
gt_labels (list[Tensor]): Ground truth labels of each box,
shape (num_gts,).
gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image,
shape (num_gts, self.bbox_code_size).
gt_labels_3d (list[Tensor]): 3D ground truth labels of each box,
shape (num_gts,).
centers2d (list[Tensor]): Projected 3D center of each box,
shape (num_gts, 2).
depths (list[Tensor]): Depth of projected 3D center of each box,
shape (num_gts,).
attr_labels (list[Tensor]): Attribute labels of each box,
shape (num_gts,).
gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
proposal_cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used
Returns:
tuple:
losses: (dict[str, Tensor]): A dictionary of loss components.
proposal_list (list[Tensor]): Proposals of each image.
"""
outs = self(x)
if gt_labels is None:
loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths,
attr_labels, img_metas)
else:
loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels,
img_metas)
losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
if proposal_cfg is None:
return losses
else:
proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg)
return losses, proposal_list
import numpy as np
import torch
from mmcv.cnn import Scale, normal_init
from mmcv.runner import force_fp32
from torch import nn as nn
from mmdet3d.core import box3d_multiclass_nms, limit_period, xywhr2xyxyr
from mmdet.core import multi_apply
from mmdet.models.builder import HEADS, build_loss
from .anchor_free_mono3d_head import AnchorFreeMono3DHead
INF = 1e8
@HEADS.register_module()
class FCOSMono3DHead(AnchorFreeMono3DHead):
"""Anchor-free head used in FCOS3D.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
regress_ranges (tuple[tuple[int, int]]): Regress range of multiple
level points.
center_sampling (bool): If true, use center sampling. Default: True.
center_sample_radius (float): Radius of center sampling. Default: 1.5.
norm_on_bbox (bool): If true, normalize the regression targets
with FPN strides. Default: True.
centerness_on_reg (bool): If true, position centerness on the
regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042.
Default: True.
        centerness_alpha (float): Parameter used to adjust the intensity
            attenuation from the center to the periphery. Default: 2.5.
loss_cls (dict): Config of classification loss.
loss_bbox (dict): Config of localization loss.
loss_dir (dict): Config of direction classification loss.
loss_attr (dict): Config of attribute classification loss.
loss_centerness (dict): Config of centerness loss.
norm_cfg (dict): dictionary to construct and config norm layer.
Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
centerness_branch (tuple[int]): Channels for centerness branch.
Default: (64, ).
""" # noqa: E501
def __init__(self,
num_classes,
in_channels,
regress_ranges=((-1, 48), (48, 96), (96, 192), (192, 384),
(384, INF)),
center_sampling=True,
center_sample_radius=1.5,
norm_on_bbox=True,
centerness_on_reg=True,
centerness_alpha=2.5,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_attr=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_centerness=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0),
norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
centerness_branch=(64, ),
**kwargs):
self.regress_ranges = regress_ranges
self.center_sampling = center_sampling
self.center_sample_radius = center_sample_radius
self.norm_on_bbox = norm_on_bbox
self.centerness_on_reg = centerness_on_reg
self.centerness_alpha = centerness_alpha
self.centerness_branch = centerness_branch
super().__init__(
num_classes,
in_channels,
loss_cls=loss_cls,
loss_bbox=loss_bbox,
loss_dir=loss_dir,
loss_attr=loss_attr,
norm_cfg=norm_cfg,
**kwargs)
self.loss_centerness = build_loss(loss_centerness)
def _init_layers(self):
"""Initialize layers of the head."""
super()._init_layers()
self.conv_centerness_prev = self._init_branch(
conv_channels=self.centerness_branch,
conv_strides=(1, ) * len(self.centerness_branch))
self.conv_centerness = nn.Conv2d(self.centerness_branch[-1], 1, 1)
self.scales = nn.ModuleList([
nn.ModuleList([Scale(1.0) for _ in range(3)]) for _ in self.strides
]) # only for offset, depth and size regression
def init_weights(self):
"""Initialize weights of the head."""
super().init_weights()
for m in self.conv_centerness_prev:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
normal_init(self.conv_centerness, std=0.01)
def forward(self, feats):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
tuple:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2).
attr_preds (list[Tensor]): Attribute scores for each scale
level, each is a 4D-tensor, the channel number is
num_points * num_attrs.
centernesses (list[Tensor]): Centerness for each scale level,
each is a 4D-tensor, the channel number is num_points * 1.
"""
return multi_apply(self.forward_single, feats, self.scales,
self.strides)
def forward_single(self, x, scale, stride):
"""Forward features of a single scale levle.
Args:
x (Tensor): FPN feature maps of the specified stride.
scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
the bbox prediction.
stride (int): The corresponding stride for feature maps, only
used to normalize the bbox prediction when self.norm_on_bbox
is True.
Returns:
tuple: scores for each class, bbox and direction class \
predictions, centerness predictions of input feature maps.
"""
cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \
super().forward_single(x)
if self.centerness_on_reg:
clone_reg_feat = reg_feat.clone()
for conv_centerness_prev_layer in self.conv_centerness_prev:
clone_reg_feat = conv_centerness_prev_layer(clone_reg_feat)
centerness = self.conv_centerness(clone_reg_feat)
else:
clone_cls_feat = cls_feat.clone()
for conv_centerness_prev_layer in self.conv_centerness_prev:
clone_cls_feat = conv_centerness_prev_layer(clone_cls_feat)
centerness = self.conv_centerness(clone_cls_feat)
# scale the bbox_pred of different level
# only apply to offset, depth and size prediction
scale_offset, scale_depth, scale_size = scale[0:3]
clone_bbox_pred = bbox_pred.clone()
bbox_pred[:, :2] = scale_offset(clone_bbox_pred[:, :2]).float()
bbox_pred[:, 2] = scale_depth(clone_bbox_pred[:, 2]).float()
bbox_pred[:, 3:6] = scale_size(clone_bbox_pred[:, 3:6]).float()
bbox_pred[:, 2] = bbox_pred[:, 2].exp()
bbox_pred[:, 3:6] = bbox_pred[:, 3:6].exp() + 1e-6 # avoid size=0
assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\
'has not been thoroughly tested for FCOS3D.'
if self.norm_on_bbox:
if not self.training:
# Note that this line is conducted only when testing
bbox_pred[:, :2] *= stride
return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness
@staticmethod
def add_sin_difference(boxes1, boxes2):
"""Convert the rotation difference to difference in sine function.
Args:
boxes1 (torch.Tensor): Original Boxes in shape (NxC), where C>=7
and the 7th dimension is rotation dimension.
boxes2 (torch.Tensor): Target boxes in shape (NxC), where C>=7 and
the 7th dimension is rotation dimension.
Returns:
tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th \
dimensions are changed.
"""
rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos(
boxes2[..., 6:7])
rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[...,
6:7])
boxes1 = torch.cat(
[boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1)
boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]],
dim=-1)
return boxes1, boxes2
@staticmethod
def get_direction_target(reg_targets,
dir_offset=0,
num_bins=2,
one_hot=True):
"""Encode direction to 0 ~ num_bins-1.
Args:
reg_targets (torch.Tensor): Bbox regression targets.
dir_offset (int): Direction offset.
num_bins (int): Number of bins to divide 2*PI.
one_hot (bool): Whether to encode as one hot.
Returns:
torch.Tensor: Encoded direction targets.
"""
rot_gt = reg_targets[..., 6]
offset_rot = limit_period(rot_gt - dir_offset, 0, 2 * np.pi)
dir_cls_targets = torch.floor(offset_rot /
(2 * np.pi / num_bins)).long()
dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1)
if one_hot:
dir_targets = torch.zeros(
*list(dir_cls_targets.shape),
num_bins,
dtype=reg_targets.dtype,
device=dir_cls_targets.device)
            dir_targets.scatter_(-1,
                                 dir_cls_targets.unsqueeze(dim=-1).long(), 1.0)
dir_cls_targets = dir_targets
return dir_cls_targets
@force_fp32(
apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'attr_preds',
'centernesses'))
def loss(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
centernesses,
gt_bboxes,
gt_labels,
gt_bboxes_3d,
gt_labels_3d,
centers2d,
depths,
attr_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Attribute scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_attrs.
centernesses (list[Tensor]): Centerness for each scale level, each
is a 4D-tensor, the channel number is num_points * 1.
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
gt_bboxes_3d (list[Tensor]): 3D boxes ground truth with shape of
(num_gts, code_size).
gt_labels_3d (list[Tensor]): same as gt_labels
centers2d (list[Tensor]): 2D centers on the image with shape of
(num_gts, 2).
depths (list[Tensor]): Depth ground truth with shape of
(num_gts, ).
attr_labels (list[Tensor]): Attributes indices of each box.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert len(cls_scores) == len(bbox_preds) == len(centernesses) == len(
attr_preds)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
bbox_preds[0].device)
labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \
self.get_targets(
all_level_points, gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels)
num_imgs = cls_scores[0].size(0)
# flatten cls_scores, bbox_preds, dir_cls_preds and centerness
flatten_cls_scores = [
cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
for cls_score in cls_scores
]
flatten_bbox_preds = [
bbox_pred.permute(0, 2, 3, 1).reshape(-1, sum(self.group_reg_dims))
for bbox_pred in bbox_preds
]
flatten_dir_cls_preds = [
dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2)
for dir_cls_pred in dir_cls_preds
]
flatten_centerness = [
centerness.permute(0, 2, 3, 1).reshape(-1)
for centerness in centernesses
]
flatten_cls_scores = torch.cat(flatten_cls_scores)
flatten_bbox_preds = torch.cat(flatten_bbox_preds)
flatten_dir_cls_preds = torch.cat(flatten_dir_cls_preds)
flatten_centerness = torch.cat(flatten_centerness)
flatten_labels_3d = torch.cat(labels_3d)
flatten_bbox_targets_3d = torch.cat(bbox_targets_3d)
flatten_centerness_targets = torch.cat(centerness_targets)
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
bg_class_ind = self.num_classes
pos_inds = ((flatten_labels_3d >= 0)
& (flatten_labels_3d < bg_class_ind)).nonzero().reshape(-1)
num_pos = len(pos_inds)
loss_cls = self.loss_cls(
flatten_cls_scores,
flatten_labels_3d,
avg_factor=num_pos + num_imgs) # avoid num_pos is 0
pos_bbox_preds = flatten_bbox_preds[pos_inds]
pos_dir_cls_preds = flatten_dir_cls_preds[pos_inds]
pos_centerness = flatten_centerness[pos_inds]
if self.pred_attrs:
flatten_attr_preds = [
attr_pred.permute(0, 2, 3, 1).reshape(-1, self.num_attrs)
for attr_pred in attr_preds
]
flatten_attr_preds = torch.cat(flatten_attr_preds)
flatten_attr_targets = torch.cat(attr_targets)
pos_attr_preds = flatten_attr_preds[pos_inds]
if num_pos > 0:
pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds]
pos_centerness_targets = flatten_centerness_targets[pos_inds]
if self.pred_attrs:
pos_attr_targets = flatten_attr_targets[pos_inds]
bbox_weights = pos_centerness_targets.new_ones(
len(pos_centerness_targets), sum(self.group_reg_dims))
equal_weights = pos_centerness_targets.new_ones(
pos_centerness_targets.shape)
code_weight = self.train_cfg.get('code_weight', None)
if code_weight:
assert len(code_weight) == sum(self.group_reg_dims)
bbox_weights = bbox_weights * bbox_weights.new_tensor(
code_weight)
if self.use_direction_classifier:
pos_dir_cls_targets = self.get_direction_target(
pos_bbox_targets_3d, self.dir_offset, one_hot=False)
if self.diff_rad_by_sin:
pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference(
pos_bbox_preds, pos_bbox_targets_3d)
loss_offset = self.loss_bbox(
pos_bbox_preds[:, :2],
pos_bbox_targets_3d[:, :2],
weight=bbox_weights[:, :2],
avg_factor=equal_weights.sum())
loss_depth = self.loss_bbox(
pos_bbox_preds[:, 2],
pos_bbox_targets_3d[:, 2],
weight=bbox_weights[:, 2],
avg_factor=equal_weights.sum())
loss_size = self.loss_bbox(
pos_bbox_preds[:, 3:6],
pos_bbox_targets_3d[:, 3:6],
weight=bbox_weights[:, 3:6],
avg_factor=equal_weights.sum())
loss_rotsin = self.loss_bbox(
pos_bbox_preds[:, 6],
pos_bbox_targets_3d[:, 6],
weight=bbox_weights[:, 6],
avg_factor=equal_weights.sum())
loss_velo = None
if self.pred_velo:
loss_velo = self.loss_bbox(
pos_bbox_preds[:, 7:9],
pos_bbox_targets_3d[:, 7:9],
weight=bbox_weights[:, 7:9],
avg_factor=equal_weights.sum())
loss_centerness = self.loss_centerness(pos_centerness,
pos_centerness_targets)
# direction classification loss
loss_dir = None
# TODO: add more check for use_direction_classifier
if self.use_direction_classifier:
loss_dir = self.loss_dir(
pos_dir_cls_preds,
pos_dir_cls_targets,
equal_weights,
avg_factor=equal_weights.sum())
# attribute classification loss
loss_attr = None
if self.pred_attrs:
loss_attr = self.loss_attr(
pos_attr_preds,
pos_attr_targets,
pos_centerness_targets,
avg_factor=pos_centerness_targets.sum())
else:
# need absolute due to possible negative delta x/y
loss_offset = pos_bbox_preds[:, :2].sum()
loss_depth = pos_bbox_preds[:, 2].sum()
loss_size = pos_bbox_preds[:, 3:6].sum()
loss_rotsin = pos_bbox_preds[:, 6].sum()
loss_velo = None
if self.pred_velo:
loss_velo = pos_bbox_preds[:, 7:9].sum()
loss_centerness = pos_centerness.sum()
loss_dir = None
if self.use_direction_classifier:
loss_dir = pos_dir_cls_preds.sum()
loss_attr = None
if self.pred_attrs:
loss_attr = pos_attr_preds.sum()
loss_dict = dict(
loss_cls=loss_cls,
loss_offset=loss_offset,
loss_depth=loss_depth,
loss_size=loss_size,
loss_rotsin=loss_rotsin,
loss_centerness=loss_centerness)
if loss_velo is not None:
loss_dict['loss_velo'] = loss_velo
if loss_dir is not None:
loss_dict['loss_dir'] = loss_dir
if loss_attr is not None:
loss_dict['loss_attr'] = loss_attr
return loss_dict
@force_fp32(
apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'attr_preds',
'centernesses'))
def get_bboxes(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
centernesses,
img_metas,
cfg=None,
rescale=None):
"""Transform network output for a batch into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_points * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_points * 4, H, W)
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Attribute scores for each scale level
Has shape (N, num_points * num_attrs, H, W)
centernesses (list[Tensor]): Centerness for each scale level with
shape (N, num_points * 1, H, W)
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used
rescale (bool): If True, return boxes in original image space
Returns:
list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
The first item is an (n, 5) tensor, where the first 4 columns \
are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
5-th column is a score between 0 and 1. The second item is a \
(n,) tensor where each item is the predicted class label of \
the corresponding box.
"""
assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \
len(centernesses) == len(attr_preds)
num_levels = len(cls_scores)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
bbox_preds[0].device)
result_list = []
for img_id in range(len(img_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds[i][img_id].detach() for i in range(num_levels)
]
if self.use_direction_classifier:
dir_cls_pred_list = [
dir_cls_preds[i][img_id].detach()
for i in range(num_levels)
]
else:
dir_cls_pred_list = [
cls_scores[i][img_id].new_full(
[2, *cls_scores[i][img_id].shape[1:]], 0).detach()
for i in range(num_levels)
]
if self.pred_attrs:
attr_pred_list = [
attr_preds[i][img_id].detach() for i in range(num_levels)
]
else:
attr_pred_list = [
cls_scores[i][img_id].new_full(
[self.num_attrs, *cls_scores[i][img_id].shape[1:]],
self.attr_background_label).detach()
for i in range(num_levels)
]
centerness_pred_list = [
centernesses[i][img_id].detach() for i in range(num_levels)
]
input_meta = img_metas[img_id]
det_bboxes = self._get_bboxes_single(
cls_score_list, bbox_pred_list, dir_cls_pred_list,
attr_pred_list, centerness_pred_list, mlvl_points, input_meta,
cfg, rescale)
result_list.append(det_bboxes)
return result_list
def _get_bboxes_single(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
centernesses,
mlvl_points,
input_meta,
cfg,
rescale=False):
"""Transform outputs for a single batch item into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for a single scale level
Has shape (num_points * num_classes, H, W).
bbox_preds (list[Tensor]): Box energies / deltas for a single scale
level with shape (num_points * bbox_code_size, H, W).
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on a single scale level with shape \
(num_points * 2, H, W)
attr_preds (list[Tensor]): Attribute scores for each scale level
Has shape (N, num_points * num_attrs, H, W)
centernesses (list[Tensor]): Centerness for a single scale level
with shape (num_points, H, W).
mlvl_points (list[Tensor]): Box reference for a single scale level
with shape (num_total_points, 2).
input_meta (dict): Metadata of input image.
cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used.
rescale (bool): If True, return boxes in original image space.
Returns:
tuples[Tensor]: Predicted 3D boxes, scores, labels and attributes.
"""
view = np.array(input_meta['cam_intrinsic'])
scale_factor = input_meta['scale_factor']
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
mlvl_centers2d = []
mlvl_bboxes = []
mlvl_scores = []
mlvl_dir_scores = []
mlvl_attr_scores = []
mlvl_centerness = []
for cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \
points in zip(cls_scores, bbox_preds, dir_cls_preds,
attr_preds, centernesses, mlvl_points):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
scores = cls_score.permute(1, 2, 0).reshape(
-1, self.cls_out_channels).sigmoid()
dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
attr_pred = attr_pred.permute(1, 2, 0).reshape(-1, self.num_attrs)
attr_score = torch.max(attr_pred, dim=-1)[1]
centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid()
bbox_pred = bbox_pred.permute(1, 2,
0).reshape(-1,
sum(self.group_reg_dims))
bbox_pred = bbox_pred[:, :self.bbox_code_size]
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
max_scores, _ = (scores * centerness[:, None]).max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
points = points[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
dir_cls_pred = dir_cls_pred[topk_inds, :]
centerness = centerness[topk_inds]
dir_cls_score = dir_cls_score[topk_inds]
attr_score = attr_score[topk_inds]
# change the offset to actual center predictions
bbox_pred[:, :2] = points - bbox_pred[:, :2]
if rescale:
bbox_pred[:, :2] /= bbox_pred[:, :2].new_tensor(scale_factor)
pred_center2d = bbox_pred[:, :3].clone()
bbox_pred[:, :3] = self.pts2Dto3D(bbox_pred[:, :3], view)
mlvl_centers2d.append(pred_center2d)
mlvl_bboxes.append(bbox_pred)
mlvl_scores.append(scores)
mlvl_dir_scores.append(dir_cls_score)
mlvl_attr_scores.append(attr_score)
mlvl_centerness.append(centerness)
mlvl_centers2d = torch.cat(mlvl_centers2d)
mlvl_bboxes = torch.cat(mlvl_bboxes)
mlvl_dir_scores = torch.cat(mlvl_dir_scores)
# change local yaw to global yaw for 3D nms
if mlvl_bboxes.shape[0] > 0:
dir_rot = limit_period(mlvl_bboxes[..., 6] - self.dir_offset, 0,
np.pi)
mlvl_bboxes[..., 6] = (
dir_rot + self.dir_offset +
np.pi * mlvl_dir_scores.to(mlvl_bboxes.dtype))
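            # The head regresses the local (observation) yaw; adding the
            # viewing angle atan2(u - cx, fx) of each projected center below
            # converts it to the global yaw in camera coordinates, which is
            # what the BEV NMS expects.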
cam_intrinsic = mlvl_centers2d.new_zeros((4, 4))
cam_intrinsic[:view.shape[0], :view.shape[1]] = \
mlvl_centers2d.new_tensor(view)
mlvl_bboxes[:, 6] = torch.atan2(
mlvl_centers2d[:, 0] - cam_intrinsic[0, 2],
cam_intrinsic[0, 0]) + mlvl_bboxes[:, 6]
mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
mlvl_bboxes, box_dim=self.bbox_code_size,
origin=(0.5, 0.5, 0.5)).bev)
mlvl_scores = torch.cat(mlvl_scores)
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
# remind that we set FG labels to [0, num_class-1] since mmdet v2.0
# BG cat_id: num_class
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
mlvl_attr_scores = torch.cat(mlvl_attr_scores)
mlvl_centerness = torch.cat(mlvl_centerness)
        # box3d_multiclass_nms does not take per-box score factors, so the
        # centerness is multiplied into the scores here before NMS
mlvl_nms_scores = mlvl_scores * mlvl_centerness[:, None]
results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
mlvl_nms_scores, cfg.score_thr,
cfg.max_per_img, cfg, mlvl_dir_scores,
mlvl_attr_scores)
bboxes, scores, labels, dir_scores, attrs = results
attrs = attrs.to(labels.dtype) # change data type to int
bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.bbox_code_size)
        # Note that the predictions use origin (0.5, 0.5, 0.5) because the
        # ground-truth centers2d are the gravity centers of the objects.
        # The center has already been shifted when computing the BEV boxes.
if not self.pred_attrs:
attrs = None
return bboxes, scores, labels, attrs
@staticmethod
def pts2Dto3D(points, view):
"""
Args:
points (torch.Tensor): points in 2D images, [N, 3], \
3 corresponds with x, y in the image and depth.
            view (np.ndarray): camera intrinsic, [3, 3]
Returns:
torch.Tensor: points in 3D space. [N, 3], \
3 corresponds with x, y, z in 3D space.
"""
assert view.shape[0] <= 4
assert view.shape[1] <= 4
assert points.shape[1] == 3
points2D = points[:, :2]
depths = points[:, 2].view(-1, 1)
unnorm_points2D = torch.cat([points2D * depths, depths], dim=1)
viewpad = torch.eye(4, dtype=points2D.dtype, device=points2D.device)
viewpad[:view.shape[0], :view.shape[1]] = points2D.new_tensor(view)
inv_viewpad = torch.inverse(viewpad).transpose(0, 1)
# Do operation in homogenous coordinates.
nbr_points = unnorm_points2D.shape[0]
homo_points2D = torch.cat(
[unnorm_points2D,
points2D.new_ones((nbr_points, 1))], dim=1)
points3D = torch.mm(homo_points2D, inv_viewpad)[:, :3]
return points3D
def _get_points_single(self,
featmap_size,
stride,
dtype,
device,
flatten=False):
"""Get points according to feature map sizes."""
y, x = super()._get_points_single(featmap_size, stride, dtype, device)
points = torch.stack((x.reshape(-1) * stride, y.reshape(-1) * stride),
dim=-1) + stride // 2
return points
def get_targets(self, points, gt_bboxes_list, gt_labels_list,
gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list,
depths_list, attr_labels_list):
"""Compute regression, classification and centerss targets for points
in multiple images.
Args:
points (list[Tensor]): Points of each fpn level, each has shape
(num_points, 2).
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
each has shape (num_gt, 4).
gt_labels_list (list[Tensor]): Ground truth labels of each box,
each has shape (num_gt,).
gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each
image, each has shape (num_gt, bbox_code_size).
gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each
box, each has shape (num_gt,).
centers2d_list (list[Tensor]): Projected 3D centers onto 2D image,
each has shape (num_gt, 2).
depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
image, each has shape (num_gt, 1).
attr_labels_list (list[Tensor]): Attribute labels of each box,
each has shape (num_gt,).
Returns:
tuple:
concat_lvl_labels (list[Tensor]): Labels of each level. \
concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \
level.
"""
assert len(points) == len(self.regress_ranges)
num_levels = len(points)
# expand regress ranges to align with points
expanded_regress_ranges = [
points[i].new_tensor(self.regress_ranges[i])[None].expand_as(
points[i]) for i in range(num_levels)
]
# concat all levels points and regress ranges
concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0)
concat_points = torch.cat(points, dim=0)
# the number of points per img, per lvl
num_points = [center.size(0) for center in points]
if attr_labels_list is None:
attr_labels_list = [
gt_labels.new_full(gt_labels.shape, self.attr_background_label)
for gt_labels in gt_labels_list
]
# get labels and bbox_targets of each image
_, _, labels_3d_list, bbox_targets_3d_list, centerness_targets_list, \
attr_targets_list = multi_apply(
self._get_target_single,
gt_bboxes_list,
gt_labels_list,
gt_bboxes_3d_list,
gt_labels_3d_list,
centers2d_list,
depths_list,
attr_labels_list,
points=concat_points,
regress_ranges=concat_regress_ranges,
num_points_per_lvl=num_points)
# split to per img, per level
labels_3d_list = [
labels_3d.split(num_points, 0) for labels_3d in labels_3d_list
]
bbox_targets_3d_list = [
bbox_targets_3d.split(num_points, 0)
for bbox_targets_3d in bbox_targets_3d_list
]
centerness_targets_list = [
centerness_targets.split(num_points, 0)
for centerness_targets in centerness_targets_list
]
attr_targets_list = [
attr_targets.split(num_points, 0)
for attr_targets in attr_targets_list
]
# concat per level image
concat_lvl_labels_3d = []
concat_lvl_bbox_targets_3d = []
concat_lvl_centerness_targets = []
concat_lvl_attr_targets = []
for i in range(num_levels):
concat_lvl_labels_3d.append(
torch.cat([labels[i] for labels in labels_3d_list]))
concat_lvl_centerness_targets.append(
torch.cat([
centerness_targets[i]
for centerness_targets in centerness_targets_list
]))
bbox_targets_3d = torch.cat([
bbox_targets_3d[i] for bbox_targets_3d in bbox_targets_3d_list
])
concat_lvl_attr_targets.append(
torch.cat(
[attr_targets[i] for attr_targets in attr_targets_list]))
            if self.norm_on_bbox:
                bbox_targets_3d[:, :2] = \
                    bbox_targets_3d[:, :2] / self.strides[i]
concat_lvl_bbox_targets_3d.append(bbox_targets_3d)
return concat_lvl_labels_3d, concat_lvl_bbox_targets_3d, \
concat_lvl_centerness_targets, concat_lvl_attr_targets
def _get_target_single(self, gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels,
points, regress_ranges, num_points_per_lvl):
"""Compute regression and classification targets for a single image."""
num_points = points.size(0)
num_gts = gt_labels.size(0)
if not isinstance(gt_bboxes_3d, torch.Tensor):
gt_bboxes_3d = gt_bboxes_3d.tensor.to(gt_bboxes.device)
if num_gts == 0:
return gt_labels.new_full((num_points,), self.background_label), \
gt_bboxes.new_zeros((num_points, 4)), \
gt_labels_3d.new_full(
(num_points,), self.background_label), \
gt_bboxes_3d.new_zeros((num_points, self.bbox_code_size)), \
gt_bboxes_3d.new_zeros((num_points,)), \
attr_labels.new_full(
(num_points,), self.attr_background_label)
areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
gt_bboxes[:, 3] - gt_bboxes[:, 1])
areas = areas[None].repeat(num_points, 1)
regress_ranges = regress_ranges[:, None, :].expand(
num_points, num_gts, 2)
gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4)
centers2d = centers2d[None].expand(num_points, num_gts, 2)
gt_bboxes_3d = gt_bboxes_3d[None].expand(num_points, num_gts,
self.bbox_code_size)
depths = depths[None, :, None].expand(num_points, num_gts, 1)
xs, ys = points[:, 0], points[:, 1]
xs = xs[:, None].expand(num_points, num_gts)
ys = ys[:, None].expand(num_points, num_gts)
delta_xs = (xs - centers2d[..., 0])[..., None]
delta_ys = (ys - centers2d[..., 1])[..., None]
bbox_targets_3d = torch.cat(
(delta_xs, delta_ys, depths, gt_bboxes_3d[..., 3:]), dim=-1)
left = xs - gt_bboxes[..., 0]
right = gt_bboxes[..., 2] - xs
top = ys - gt_bboxes[..., 1]
bottom = gt_bboxes[..., 3] - ys
bbox_targets = torch.stack((left, top, right, bottom), -1)
assert self.center_sampling is True, 'Setting center_sampling to '\
'False has not been implemented for FCOS3D.'
# condition1: inside a `center bbox`
radius = self.center_sample_radius
center_xs = centers2d[..., 0]
center_ys = centers2d[..., 1]
center_gts = torch.zeros_like(gt_bboxes)
stride = center_xs.new_zeros(center_xs.shape)
# project the points on current lvl back to the `original` sizes
lvl_begin = 0
for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl):
lvl_end = lvl_begin + num_points_lvl
stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius
lvl_begin = lvl_end
center_gts[..., 0] = center_xs - stride
center_gts[..., 1] = center_ys - stride
center_gts[..., 2] = center_xs + stride
center_gts[..., 3] = center_ys + stride
cb_dist_left = xs - center_gts[..., 0]
cb_dist_right = center_gts[..., 2] - xs
cb_dist_top = ys - center_gts[..., 1]
cb_dist_bottom = center_gts[..., 3] - ys
center_bbox = torch.stack(
(cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1)
inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
# condition2: limit the regression range for each location
max_regress_distance = bbox_targets.max(-1)[0]
inside_regress_range = (
(max_regress_distance >= regress_ranges[..., 0])
& (max_regress_distance <= regress_ranges[..., 1]))
# center-based criterion to deal with ambiguity
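        # (unlike 2D FCOS, which keeps the smallest-area box, a point falling
        # into several center boxes is assigned to the ground truth whose
        # projected 3D center is nearest)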
dists = torch.sqrt(torch.sum(bbox_targets_3d[..., :2]**2, dim=-1))
dists[inside_gt_bbox_mask == 0] = INF
dists[inside_regress_range == 0] = INF
min_dist, min_dist_inds = dists.min(dim=1)
labels = gt_labels[min_dist_inds]
labels_3d = gt_labels_3d[min_dist_inds]
attr_labels = attr_labels[min_dist_inds]
labels[min_dist == INF] = self.background_label # set as BG
labels_3d[min_dist == INF] = self.background_label # set as BG
attr_labels[min_dist == INF] = self.attr_background_label
bbox_targets = bbox_targets[range(num_points), min_dist_inds]
bbox_targets_3d = bbox_targets_3d[range(num_points), min_dist_inds]
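        # Centerness target: decay exponentially with the distance to the
        # projected 3D center, normalized by sqrt(2) times the per-level
        # center-sampling extent (stride * center_sample_radius) computed
        # above, then scaled by centerness_alpha inside the exponent.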
relative_dists = torch.sqrt(
torch.sum(bbox_targets_3d[..., :2]**2,
dim=-1)) / (1.414 * stride[:, 0])
# [N, 1] / [N, 1]
centerness_targets = torch.exp(-self.centerness_alpha * relative_dists)
return labels, bbox_targets, labels_3d, bbox_targets_3d, \
centerness_targets, attr_labels
@@ -5,7 +5,8 @@ import random
import torch
from os.path import dirname, exists, join

from mmdet3d.core.bbox import (CameraInstance3DBoxes, DepthInstance3DBoxes,
                               LiDARInstance3DBoxes)
from mmdet3d.models.builder import build_detector
@@ -316,3 +317,56 @@ def test_centerpoint():
    assert boxes_3d_0.tensor.shape[1] == 9
    assert scores_3d_0.shape[0] >= 0
    assert labels_3d_0.shape[0] >= 0
def test_fcos3d():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
_setup_seed(0)
fcos3d_cfg = _get_detector_cfg(
'fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py')
self = build_detector(fcos3d_cfg).cuda()
imgs = torch.rand([1, 3, 928, 1600], dtype=torch.float32).cuda()
gt_bboxes = [torch.rand([3, 4], dtype=torch.float32).cuda()]
gt_bboxes_3d = CameraInstance3DBoxes(
torch.rand([3, 9], device='cuda'), box_dim=9)
gt_labels = [torch.randint(0, 10, [3], device='cuda')]
gt_labels_3d = gt_labels
centers2d = [torch.rand([3, 2], dtype=torch.float32).cuda()]
depths = [torch.rand([3], dtype=torch.float32).cuda()]
attr_labels = [torch.randint(0, 9, [3], device='cuda')]
img_metas = [
dict(
cam_intrinsic=[[1260.8474446004698, 0.0, 807.968244525554],
[0.0, 1260.8474446004698, 495.3344268742088],
[0.0, 0.0, 1.0]],
scale_factor=np.array([1., 1., 1., 1.], dtype=np.float32),
box_type_3d=CameraInstance3DBoxes)
]
# test forward_train
losses = self.forward_train(imgs, img_metas, gt_bboxes, gt_labels,
gt_bboxes_3d, gt_labels_3d, centers2d, depths,
attr_labels)
assert losses['loss_cls'] >= 0
assert losses['loss_offset'] >= 0
assert losses['loss_depth'] >= 0
assert losses['loss_size'] >= 0
assert losses['loss_rotsin'] >= 0
assert losses['loss_centerness'] >= 0
assert losses['loss_velo'] >= 0
assert losses['loss_dir'] >= 0
assert losses['loss_attr'] >= 0
# test simple_test
results = self.simple_test(imgs, img_metas)
boxes_3d = results[0]['img_bbox']['boxes_3d']
scores_3d = results[0]['img_bbox']['scores_3d']
labels_3d = results[0]['img_bbox']['labels_3d']
attrs_3d = results[0]['img_bbox']['attrs_3d']
assert boxes_3d.tensor.shape[0] >= 0
assert boxes_3d.tensor.shape[1] == 9
assert scores_3d.shape[0] >= 0
assert labels_3d.shape[0] >= 0
assert attrs_3d.shape[0] >= 0
@@ -5,8 +5,8 @@ import random
import torch
from os.path import dirname, exists, join

from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes,
                               DepthInstance3DBoxes, LiDARInstance3DBoxes)
from mmdet3d.models.builder import build_head
from mmdet.apis import set_random_seed
@@ -1044,3 +1044,73 @@ def test_shape_aware_head_getboxes():
                                        input_metas)
    assert len(result_list[0][1]) > 0  # ensure not all boxes are filtered
    assert (result_list[0][1] > 0.3).all()
def test_fcos_mono3d_head():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
_setup_seed(0)
fcos3d_head_cfg = _get_head_cfg(
'fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py')
self = build_head(fcos3d_head_cfg).cuda()
feats = [
torch.rand([2, 256, 116, 200], dtype=torch.float32).cuda(),
torch.rand([2, 256, 58, 100], dtype=torch.float32).cuda(),
torch.rand([2, 256, 29, 50], dtype=torch.float32).cuda(),
torch.rand([2, 256, 15, 25], dtype=torch.float32).cuda(),
torch.rand([2, 256, 8, 13], dtype=torch.float32).cuda()
]
# test forward
ret_dict = self(feats)
assert len(ret_dict) == 5
assert len(ret_dict[0]) == 5
assert ret_dict[0][0].shape == torch.Size([2, 10, 116, 200])
# test loss
gt_bboxes = [
torch.rand([3, 4], dtype=torch.float32).cuda(),
torch.rand([3, 4], dtype=torch.float32).cuda()
]
gt_bboxes_3d = CameraInstance3DBoxes(
torch.rand([3, 9], device='cuda'), box_dim=9)
gt_labels = [torch.randint(0, 10, [3], device='cuda') for i in range(2)]
gt_labels_3d = gt_labels
centers2d = [
torch.rand([3, 2], dtype=torch.float32).cuda(),
torch.rand([3, 2], dtype=torch.float32).cuda()
]
depths = [
torch.rand([3], dtype=torch.float32).cuda(),
torch.rand([3], dtype=torch.float32).cuda()
]
attr_labels = [torch.randint(0, 9, [3], device='cuda') for i in range(2)]
img_metas = [
dict(
cam_intrinsic=[[1260.8474446004698, 0.0, 807.968244525554],
[0.0, 1260.8474446004698, 495.3344268742088],
[0.0, 0.0, 1.0]],
scale_factor=np.array([1., 1., 1., 1.], dtype=np.float32),
box_type_3d=CameraInstance3DBoxes) for i in range(2)
]
losses = self.loss(*ret_dict, gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels, img_metas)
assert losses['loss_cls'] >= 0
assert losses['loss_offset'] >= 0
assert losses['loss_depth'] >= 0
assert losses['loss_size'] >= 0
assert losses['loss_rotsin'] >= 0
assert losses['loss_centerness'] >= 0
assert losses['loss_velo'] >= 0
assert losses['loss_dir'] >= 0
assert losses['loss_attr'] >= 0
# test get_boxes
results = self.get_bboxes(*ret_dict, img_metas)
assert len(results) == 2
assert len(results[0]) == 4
assert results[0][0].tensor.shape == torch.Size([200, 9])
assert results[0][1].shape == torch.Size([200])
assert results[0][2].shape == torch.Size([200])
assert results[0][3].shape == torch.Size([200])