Unverified Commit f7356f4b authored by twang, committed by GitHub

[Feature] Support FCOS3D head (#442)

* Support base mono3d dense head and anchor free mono3d head

* Support FCOS3D head

* Support FCOS3D baseline on nuScenes

* Fix an import error caused by update of mmcv/mmdet

* Change img_scale to scale_factor in the MultiScaleFlipAug in the config

* Add pred_bbox2d in the params of anchor_free_mono3d_head

* Add unit test for fcos3d head

* Fix a minor bug when setting img_metas in the unit test

* Add unit test for fcos3d detector

* Simplify the logic of weights initialization

* Add comments to specify the reason of cloning features

* Update head config
parent a0090aa1
dataset_type = 'NuScenesMonoDataset'
data_root = 'data/nuscenes/'
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
# Input modality for the nuScenes dataset; this is consistent with the
# submission format, which requires the information in input_modality.
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='MultiScaleFlipAug',
scale_factor=1.0,
flip=False,
transforms=[
dict(type='RandomFlip3D'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json',
img_prefix=data_root,
classes=class_names,
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
box_type_3d='Camera'),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
img_prefix=data_root,
classes=class_names,
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
box_type_3d='Camera'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
img_prefix=data_root,
classes=class_names,
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
box_type_3d='Camera'))
evaluation = dict(interval=2)
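A minimal usage sketch (illustrative, not part of the config; it assumes the nuScenes mono3D info files referenced above have already been generated) of how this dataset config is typically consumed:

from mmcv import Config
from mmdet3d.datasets import build_dataset

cfg = Config.fromfile('configs/_base_/datasets/nus-mono3d.py')
train_set = build_dataset(cfg.data.train)
sample = train_set[0]
# the keys match what Collect3D gathers above, plus the collected img_metas
print(sample.keys())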
model = dict(
type='FCOSMono3D',
pretrained='open-mmlab://detectron2/resnet101_caffe',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
style='caffe'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
start_level=1,
add_extra_convs=True,
extra_convs_on_inputs=False, # use P5
num_outs=5,
relu_before_extra_convs=True),
bbox_head=dict(
type='FCOSMono3DHead',
num_classes=10,
in_channels=256,
stacked_convs=2,
feat_channels=256,
use_direction_classifier=True,
diff_rad_by_sin=True,
pred_attrs=True,
pred_velo=True,
dir_offset=0.7854, # pi/4
strides=[8, 16, 32, 64, 128],
group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo
cls_branch=(256, ),
reg_branch=(
(256, ), # offset
(256, ), # depth
(256, ), # size
(256, ), # rot
() # velo
),
dir_branch=(256, ),
attr_branch=(256, ),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
norm_on_bbox=True,
centerness_on_reg=True,
center_sampling=True,
conv_bias=True,
dcn_on_last_conv=True),
train_cfg=dict(
allowed_border=0,
code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05],
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=1000,
nms_thr=0.8,
score_thr=0.05,
min_bbox_size=0,
max_per_img=200))
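A quick, illustrative sanity check on the bookkeeping above: group_reg_dims must sum to the 9-dim camera box code so that code_weight (one entry per regression dimension) lines up with the concatenated predictions.

group_reg_dims = (2, 1, 3, 1, 2)  # offset, depth, size, rot, velo
code_weight = [1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05]
assert sum(group_reg_dims) == len(code_weight) == 9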
_base_ = [
'../_base_/datasets/nus-mono3d.py', '../_base_/models/fcos3d.py',
'../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True)))
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='MultiScaleFlipAug',
scale_factor=1.0,
flip=False,
transforms=[
dict(type='RandomFlip3D'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img']),
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
# optimizer
optimizer = dict(
lr=0.002, paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
optimizer_config = dict(
_delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
total_epochs = 12
evaluation = dict(interval=2)
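A minimal sketch (run from the repository root; illustrative only) of how mmcv's Config merges the _base_ files with the overrides in this config:

from mmcv import Config

cfg = Config.fromfile(
    'configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py')
print(cfg.model.backbone.dcn)  # the DCNv2 setting merged into the base model
print(cfg.optimizer.lr)        # 0.002, overriding the base 1x schedule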
from .anchor3d_head import Anchor3DHead
from .anchor_free_mono3d_head import AnchorFreeMono3DHead
from .base_conv_bbox_head import BaseConvBboxHead
from .base_mono3d_dense_head import BaseMono3DDenseHead
from .centerpoint_head import CenterHead
from .fcos_mono3d_head import FCOSMono3DHead
from .free_anchor3d_head import FreeAnchor3DHead
from .parta2_rpn_head import PartA2RPNHead
from .shape_aware_head import ShapeAwareHead
@@ -9,5 +12,6 @@ from .vote_head import VoteHead
__all__ = [
'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead',
    'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead'
]
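Since the new heads are exported here and registered via @HEADS.register_module(), they can be built straight from a config dict; a minimal hedged sketch with placeholder channels (the real values live in configs/_base_/models/fcos3d.py):

from mmdet3d.models.builder import build_head

head = build_head(
    dict(type='FCOSMono3DHead', num_classes=10, in_channels=256))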
import torch
from abc import abstractmethod
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from mmcv.runner import force_fp32
from torch import nn as nn
from mmdet.core import multi_apply
from mmdet.models.builder import HEADS, build_loss
from .base_mono3d_dense_head import BaseMono3DDenseHead
@HEADS.register_module()
class AnchorFreeMono3DHead(BaseMono3DDenseHead):
"""Anchor-free head for monocular 3D object detection.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
feat_channels (int): Number of hidden channels. Used in child classes.
stacked_convs (int): Number of stacking convs of the head.
strides (tuple): Downsample factor of each feature map.
dcn_on_last_conv (bool): If true, use dcn in the last layer of
towers. Default: False.
conv_bias (bool | str): If specified as `auto`, it will be decided by
the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
None, otherwise False. Default: "auto".
        background_label (int | None): Label ID of background, set as 0 for
            RPN and num_classes for other heads. It will automatically be set
            as num_classes if None is given.
use_direction_classifier (bool): Whether to add a direction classifier.
diff_rad_by_sin (bool): Whether to change the difference into sin
difference for box regression loss.
loss_cls (dict): Config of classification loss.
loss_bbox (dict): Config of localization loss.
loss_dir (dict): Config of direction classifier loss.
loss_attr (dict): Config of attribute classifier loss, which is only
active when pred_attrs=True.
bbox_code_size (int): Dimensions of predicted bounding boxes.
pred_attrs (bool): Whether to predict attributes. Default to False.
num_attrs (int): The number of attributes to be predicted. Default: 9.
pred_velo (bool): Whether to predict velocity. Default to False.
pred_bbox2d (bool): Whether to predict 2D boxes. Default to False.
group_reg_dims (tuple[int]): The dimension of each regression target
group. Default: (2, 1, 3, 1, 2).
cls_branch (tuple[int]): Channels for classification branch.
Default: (128, 64).
reg_branch (tuple[tuple]): Channels for regression branch.
Default: (
(128, 64), # offset
(128, 64), # depth
(64, ), # size
(64, ), # rot
() # velo
),
dir_branch (tuple[int]): Channels for direction classification branch.
Default: (64, ).
        attr_branch (tuple[int]): Channels for attribute classification
            branch. Default: (64, ).
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Config dict for normalization layer. Default: None.
train_cfg (dict): Training config of anchor head.
test_cfg (dict): Testing config of anchor head.
""" # noqa: W605
_version = 1
def __init__(
self,
num_classes,
in_channels,
feat_channels=256,
stacked_convs=4,
strides=(4, 8, 16, 32, 64),
dcn_on_last_conv=False,
conv_bias='auto',
background_label=None,
use_direction_classifier=True,
diff_rad_by_sin=True,
dir_offset=0,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_attr=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
bbox_code_size=9, # For nuscenes
pred_attrs=False,
num_attrs=9, # For nuscenes
pred_velo=False,
pred_bbox2d=False,
group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo,
cls_branch=(128, 64),
reg_branch=(
(128, 64), # offset
(128, 64), # depth
(64, ), # size
(64, ), # rot
() # velo
),
dir_branch=(64, ),
attr_branch=(64, ),
conv_cfg=None,
norm_cfg=None,
train_cfg=None,
test_cfg=None):
super(AnchorFreeMono3DHead, self).__init__()
self.num_classes = num_classes
self.cls_out_channels = num_classes
self.in_channels = in_channels
self.feat_channels = feat_channels
self.stacked_convs = stacked_convs
self.strides = strides
self.dcn_on_last_conv = dcn_on_last_conv
assert conv_bias == 'auto' or isinstance(conv_bias, bool)
self.conv_bias = conv_bias
self.use_direction_classifier = use_direction_classifier
self.diff_rad_by_sin = diff_rad_by_sin
self.dir_offset = dir_offset
self.loss_cls = build_loss(loss_cls)
self.loss_bbox = build_loss(loss_bbox)
self.loss_dir = build_loss(loss_dir)
self.bbox_code_size = bbox_code_size
self.group_reg_dims = list(group_reg_dims)
self.cls_branch = cls_branch
self.reg_branch = reg_branch
        assert len(reg_branch) == len(group_reg_dims), 'The number of '\
            'elements in reg_branch and group_reg_dims should be the same.'
self.pred_velo = pred_velo
self.pred_bbox2d = pred_bbox2d
self.out_channels = []
for reg_branch_channels in reg_branch:
if len(reg_branch_channels) > 0:
self.out_channels.append(reg_branch_channels[-1])
else:
self.out_channels.append(-1)
self.dir_branch = dir_branch
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.fp16_enabled = False
self.background_label = (
num_classes if background_label is None else background_label)
# background_label should be either 0 or num_classes
assert (self.background_label == 0
or self.background_label == num_classes)
self.pred_attrs = pred_attrs
self.attr_background_label = -1
self.num_attrs = num_attrs
if self.pred_attrs:
self.attr_background_label = num_attrs
self.loss_attr = build_loss(loss_attr)
self.attr_branch = attr_branch
self._init_layers()
def _init_layers(self):
"""Initialize layers of the head."""
self._init_cls_convs()
self._init_reg_convs()
self._init_predictor()
def _init_cls_convs(self):
"""Initialize classification conv layers of the head."""
self.cls_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
if self.dcn_on_last_conv and i == self.stacked_convs - 1:
conv_cfg = dict(type='DCNv2')
else:
conv_cfg = self.conv_cfg
self.cls_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.conv_bias))
def _init_reg_convs(self):
"""Initialize bbox regression conv layers of the head."""
self.reg_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
if self.dcn_on_last_conv and i == self.stacked_convs - 1:
conv_cfg = dict(type='DCNv2')
else:
conv_cfg = self.conv_cfg
self.reg_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.conv_bias))
def _init_branch(self, conv_channels=(64), conv_strides=(1)):
"""Initialize conv layers as a prediction branch."""
conv_before_pred = nn.ModuleList()
if isinstance(conv_channels, int):
conv_channels = [self.feat_channels] + [conv_channels]
conv_strides = [conv_strides]
else:
conv_channels = [self.feat_channels] + list(conv_channels)
conv_strides = list(conv_strides)
for i in range(len(conv_strides)):
conv_before_pred.append(
ConvModule(
conv_channels[i],
conv_channels[i + 1],
3,
stride=conv_strides[i],
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.conv_bias))
return conv_before_pred
def _init_predictor(self):
"""Initialize predictor layers of the head."""
self.conv_cls_prev = self._init_branch(
conv_channels=self.cls_branch,
conv_strides=(1, ) * len(self.cls_branch))
self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels,
1)
self.conv_reg_prevs = nn.ModuleList()
self.conv_regs = nn.ModuleList()
for i in range(len(self.group_reg_dims)):
reg_dim = self.group_reg_dims[i]
reg_branch_channels = self.reg_branch[i]
out_channel = self.out_channels[i]
if len(reg_branch_channels) > 0:
self.conv_reg_prevs.append(
self._init_branch(
conv_channels=reg_branch_channels,
conv_strides=(1, ) * len(reg_branch_channels)))
self.conv_regs.append(nn.Conv2d(out_channel, reg_dim, 1))
else:
self.conv_reg_prevs.append(None)
self.conv_regs.append(
nn.Conv2d(self.feat_channels, reg_dim, 1))
if self.use_direction_classifier:
self.conv_dir_cls_prev = self._init_branch(
conv_channels=self.dir_branch,
conv_strides=(1, ) * len(self.dir_branch))
self.conv_dir_cls = nn.Conv2d(self.dir_branch[-1], 2, 1)
if self.pred_attrs:
self.conv_attr_prev = self._init_branch(
conv_channels=self.attr_branch,
conv_strides=(1, ) * len(self.attr_branch))
self.conv_attr = nn.Conv2d(self.attr_branch[-1], self.num_attrs, 1)
def init_weights(self):
"""Initialize weights of the head."""
for modules in [self.cls_convs, self.reg_convs, self.conv_cls_prev]:
for m in modules:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
for conv_reg_prev in self.conv_reg_prevs:
if conv_reg_prev is None:
continue
for m in conv_reg_prev:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
if self.use_direction_classifier:
for m in self.conv_dir_cls_prev:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
if self.pred_attrs:
for m in self.conv_attr_prev:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.conv_cls, std=0.01, bias=bias_cls)
for conv_reg in self.conv_regs:
normal_init(conv_reg, std=0.01)
if self.use_direction_classifier:
normal_init(self.conv_dir_cls, std=0.01, bias=bias_cls)
if self.pred_attrs:
normal_init(self.conv_attr, std=0.01, bias=bias_cls)
def forward(self, feats):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
            tuple: Usually contains classification scores, bbox predictions, \
and direction class predictions.
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Attribute scores for each scale
level, each is a 4D-tensor, the channel number is
num_points * num_attrs.
"""
return multi_apply(self.forward_single, feats)[:5]
def forward_single(self, x):
"""Forward features of a single scale levle.
Args:
x (Tensor): FPN feature maps of the specified stride.
Returns:
            tuple: Scores for each class, bbox predictions, direction class
                and attribute predictions, plus the features after the
                classification and regression conv layers; some models, such
                as FCOS, need these features.
"""
cls_feat = x
reg_feat = x
for cls_layer in self.cls_convs:
cls_feat = cls_layer(cls_feat)
# clone the cls_feat for reusing the feature map afterwards
clone_cls_feat = cls_feat.clone()
for conv_cls_prev_layer in self.conv_cls_prev:
clone_cls_feat = conv_cls_prev_layer(clone_cls_feat)
cls_score = self.conv_cls(clone_cls_feat)
for reg_layer in self.reg_convs:
reg_feat = reg_layer(reg_feat)
bbox_pred = []
for i in range(len(self.group_reg_dims)):
# clone the reg_feat for reusing the feature map afterwards
clone_reg_feat = reg_feat.clone()
if len(self.reg_branch[i]) > 0:
for conv_reg_prev_layer in self.conv_reg_prevs[i]:
clone_reg_feat = conv_reg_prev_layer(clone_reg_feat)
bbox_pred.append(self.conv_regs[i](clone_reg_feat))
bbox_pred = torch.cat(bbox_pred, dim=1)
dir_cls_pred = None
if self.use_direction_classifier:
clone_reg_feat = reg_feat.clone()
for conv_dir_cls_prev_layer in self.conv_dir_cls_prev:
clone_reg_feat = conv_dir_cls_prev_layer(clone_reg_feat)
dir_cls_pred = self.conv_dir_cls(clone_reg_feat)
attr_pred = None
if self.pred_attrs:
# clone the cls_feat for reusing the feature map afterwards
clone_cls_feat = cls_feat.clone()
for conv_attr_prev_layer in self.conv_attr_prev:
clone_cls_feat = conv_attr_prev_layer(clone_cls_feat)
attr_pred = self.conv_attr(clone_cls_feat)
return cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, \
reg_feat
@abstractmethod
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def loss(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
gt_bboxes,
gt_labels,
gt_bboxes_3d,
gt_labels_3d,
centers2d,
depths,
attr_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_attrs.
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
gt_bboxes_3d (list[Tensor]): 3D Ground truth bboxes for each
image with shape (num_gts, bbox_code_size).
gt_labels_3d (list[Tensor]): 3D class indices of each box.
centers2d (list[Tensor]): Projected 3D centers onto 2D images.
depths (list[Tensor]): Depth of projected centers on 2D images.
attr_labels (list[Tensor], optional): Attribute indices
corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
"""
raise NotImplementedError
@abstractmethod
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def get_bboxes(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
img_metas,
cfg=None,
rescale=None):
"""Transform network output for a batch into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_points * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_points * bbox_code_size, H, W)
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Attribute scores for each scale level
Has shape (N, num_points * num_attrs, H, W)
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used
rescale (bool): If True, return boxes in original image space
"""
raise NotImplementedError
@abstractmethod
def get_targets(self, points, gt_bboxes_list, gt_labels_list,
gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list,
depths_list, attr_labels_list):
"""Compute regression, classification and centerss targets for points
in multiple images.
Args:
points (list[Tensor]): Points of each fpn level, each has shape
(num_points, 2).
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
each has shape (num_gt, 4).
gt_labels_list (list[Tensor]): Ground truth labels of each box,
each has shape (num_gt,).
gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each
image, each has shape (num_gt, bbox_code_size).
gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each
box, each has shape (num_gt,).
centers2d_list (list[Tensor]): Projected 3D centers onto 2D image,
each has shape (num_gt, 2).
depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
image, each has shape (num_gt, 1).
attr_labels_list (list[Tensor]): Attribute labels of each box,
each has shape (num_gt,).
"""
raise NotImplementedError
def _get_points_single(self,
featmap_size,
stride,
dtype,
device,
flatten=False):
"""Get points of a single scale level."""
h, w = featmap_size
x_range = torch.arange(w, dtype=dtype, device=device)
y_range = torch.arange(h, dtype=dtype, device=device)
y, x = torch.meshgrid(y_range, x_range)
if flatten:
y = y.flatten()
x = x.flatten()
return y, x
def get_points(self, featmap_sizes, dtype, device, flatten=False):
"""Get points according to feature map sizes.
Args:
featmap_sizes (list[tuple]): Multi-level feature map sizes.
dtype (torch.dtype): Type of points.
device (torch.device): Device of points.
Returns:
tuple: points of each image.
"""
mlvl_points = []
for i in range(len(featmap_sizes)):
mlvl_points.append(
self._get_points_single(featmap_sizes[i], self.strides[i],
dtype, device, flatten))
return mlvl_points
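To make the per-group prediction layout of _init_predictor above concrete, a small hedged sketch using the nuScenes values from the config earlier in this commit: every non-empty reg_branch entry gets its own small conv tower, while an empty entry (velocity here) is predicted by a 1x1 conv directly from the shared regression features.

group_reg_dims = (2, 1, 3, 1, 2)  # offset, depth, size, rot, velo
reg_branch = ((256, ), (256, ), (256, ), (256, ), ())
for dims, branch in zip(group_reg_dims, reg_branch):
    source = branch[-1] if branch else 'feat_channels'
    print(f'{dims}-dim group predicted by a 1x1 conv from {source} channels')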
from abc import ABCMeta, abstractmethod
from torch import nn as nn
class BaseMono3DDenseHead(nn.Module, metaclass=ABCMeta):
"""Base class for Monocular 3D DenseHeads."""
def __init__(self):
super(BaseMono3DDenseHead, self).__init__()
@abstractmethod
def loss(self, **kwargs):
"""Compute losses of the head."""
pass
@abstractmethod
def get_bboxes(self, **kwargs):
"""Transform network output for a batch into bbox predictions."""
pass
def forward_train(self,
x,
img_metas,
gt_bboxes,
gt_labels=None,
gt_bboxes_3d=None,
gt_labels_3d=None,
centers2d=None,
depths=None,
attr_labels=None,
gt_bboxes_ignore=None,
proposal_cfg=None,
**kwargs):
"""
Args:
x (list[Tensor]): Features from FPN.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes (list[Tensor]): Ground truth bboxes of the image,
shape (num_gts, 4).
gt_labels (list[Tensor]): Ground truth labels of each box,
shape (num_gts,).
gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image,
shape (num_gts, self.bbox_code_size).
gt_labels_3d (list[Tensor]): 3D ground truth labels of each box,
shape (num_gts,).
centers2d (list[Tensor]): Projected 3D center of each box,
shape (num_gts, 2).
depths (list[Tensor]): Depth of projected 3D center of each box,
shape (num_gts,).
attr_labels (list[Tensor]): Attribute labels of each box,
shape (num_gts,).
gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
proposal_cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used
Returns:
tuple:
losses: (dict[str, Tensor]): A dictionary of loss components.
proposal_list (list[Tensor]): Proposals of each image.
"""
outs = self(x)
if gt_labels is None:
loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths,
attr_labels, img_metas)
else:
loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels,
img_metas)
losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
if proposal_cfg is None:
return losses
else:
proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg)
return losses, proposal_list
import numpy as np
import torch
from mmcv.cnn import Scale, normal_init
from mmcv.runner import force_fp32
from torch import nn as nn
from mmdet3d.core import box3d_multiclass_nms, limit_period, xywhr2xyxyr
from mmdet.core import multi_apply
from mmdet.models.builder import HEADS, build_loss
from .anchor_free_mono3d_head import AnchorFreeMono3DHead
INF = 1e8
@HEADS.register_module()
class FCOSMono3DHead(AnchorFreeMono3DHead):
"""Anchor-free head used in FCOS3D.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
regress_ranges (tuple[tuple[int, int]]): Regress range of multiple
level points.
center_sampling (bool): If true, use center sampling. Default: True.
center_sample_radius (float): Radius of center sampling. Default: 1.5.
norm_on_bbox (bool): If true, normalize the regression targets
with FPN strides. Default: True.
centerness_on_reg (bool): If true, position centerness on the
regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042.
Default: True.
centerness_alpha: Parameter used to adjust the intensity attenuation
from the center to the periphery. Default: 2.5.
loss_cls (dict): Config of classification loss.
loss_bbox (dict): Config of localization loss.
loss_dir (dict): Config of direction classification loss.
loss_attr (dict): Config of attribute classification loss.
loss_centerness (dict): Config of centerness loss.
norm_cfg (dict): dictionary to construct and config norm layer.
Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
centerness_branch (tuple[int]): Channels for centerness branch.
Default: (64, ).
""" # noqa: E501
def __init__(self,
num_classes,
in_channels,
regress_ranges=((-1, 48), (48, 96), (96, 192), (192, 384),
(384, INF)),
center_sampling=True,
center_sample_radius=1.5,
norm_on_bbox=True,
centerness_on_reg=True,
centerness_alpha=2.5,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_attr=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_centerness=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0),
norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
centerness_branch=(64, ),
**kwargs):
self.regress_ranges = regress_ranges
self.center_sampling = center_sampling
self.center_sample_radius = center_sample_radius
self.norm_on_bbox = norm_on_bbox
self.centerness_on_reg = centerness_on_reg
self.centerness_alpha = centerness_alpha
self.centerness_branch = centerness_branch
super().__init__(
num_classes,
in_channels,
loss_cls=loss_cls,
loss_bbox=loss_bbox,
loss_dir=loss_dir,
loss_attr=loss_attr,
norm_cfg=norm_cfg,
**kwargs)
self.loss_centerness = build_loss(loss_centerness)
def _init_layers(self):
"""Initialize layers of the head."""
super()._init_layers()
self.conv_centerness_prev = self._init_branch(
conv_channels=self.centerness_branch,
conv_strides=(1, ) * len(self.centerness_branch))
self.conv_centerness = nn.Conv2d(self.centerness_branch[-1], 1, 1)
self.scales = nn.ModuleList([
nn.ModuleList([Scale(1.0) for _ in range(3)]) for _ in self.strides
]) # only for offset, depth and size regression
def init_weights(self):
"""Initialize weights of the head."""
super().init_weights()
for m in self.conv_centerness_prev:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
normal_init(self.conv_centerness, std=0.01)
def forward(self, feats):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
tuple:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2).
attr_preds (list[Tensor]): Attribute scores for each scale
level, each is a 4D-tensor, the channel number is
num_points * num_attrs.
centernesses (list[Tensor]): Centerness for each scale level,
each is a 4D-tensor, the channel number is num_points * 1.
"""
return multi_apply(self.forward_single, feats, self.scales,
self.strides)
def forward_single(self, x, scale, stride):
"""Forward features of a single scale levle.
Args:
x (Tensor): FPN feature maps of the specified stride.
scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
the bbox prediction.
stride (int): The corresponding stride for feature maps, only
used to normalize the bbox prediction when self.norm_on_bbox
is True.
Returns:
tuple: scores for each class, bbox and direction class \
predictions, centerness predictions of input feature maps.
"""
cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \
super().forward_single(x)
if self.centerness_on_reg:
clone_reg_feat = reg_feat.clone()
for conv_centerness_prev_layer in self.conv_centerness_prev:
clone_reg_feat = conv_centerness_prev_layer(clone_reg_feat)
centerness = self.conv_centerness(clone_reg_feat)
else:
clone_cls_feat = cls_feat.clone()
for conv_centerness_prev_layer in self.conv_centerness_prev:
clone_cls_feat = conv_centerness_prev_layer(clone_cls_feat)
centerness = self.conv_centerness(clone_cls_feat)
# scale the bbox_pred of different level
# only apply to offset, depth and size prediction
scale_offset, scale_depth, scale_size = scale[0:3]
clone_bbox_pred = bbox_pred.clone()
bbox_pred[:, :2] = scale_offset(clone_bbox_pred[:, :2]).float()
bbox_pred[:, 2] = scale_depth(clone_bbox_pred[:, 2]).float()
bbox_pred[:, 3:6] = scale_size(clone_bbox_pred[:, 3:6]).float()
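        # depth and size are regressed in log-space; exponentiate them so the
        # decoded values stay strictly positive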
bbox_pred[:, 2] = bbox_pred[:, 2].exp()
bbox_pred[:, 3:6] = bbox_pred[:, 3:6].exp() + 1e-6 # avoid size=0
assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\
'has not been thoroughly tested for FCOS3D.'
if self.norm_on_bbox:
if not self.training:
# Note that this line is conducted only when testing
bbox_pred[:, :2] *= stride
return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness
@staticmethod
def add_sin_difference(boxes1, boxes2):
"""Convert the rotation difference to difference in sine function.
Args:
boxes1 (torch.Tensor): Original Boxes in shape (NxC), where C>=7
and the 7th dimension is rotation dimension.
boxes2 (torch.Tensor): Target boxes in shape (NxC), where C>=7 and
the 7th dimension is rotation dimension.
Returns:
tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th \
dimensions are changed.
"""
rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos(
boxes2[..., 6:7])
rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[...,
6:7])
boxes1 = torch.cat(
[boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1)
boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]],
dim=-1)
return boxes1, boxes2
@staticmethod
def get_direction_target(reg_targets,
dir_offset=0,
num_bins=2,
one_hot=True):
"""Encode direction to 0 ~ num_bins-1.
Args:
reg_targets (torch.Tensor): Bbox regression targets.
dir_offset (int): Direction offset.
num_bins (int): Number of bins to divide 2*PI.
one_hot (bool): Whether to encode as one hot.
Returns:
torch.Tensor: Encoded direction targets.
"""
rot_gt = reg_targets[..., 6]
offset_rot = limit_period(rot_gt - dir_offset, 0, 2 * np.pi)
dir_cls_targets = torch.floor(offset_rot /
(2 * np.pi / num_bins)).long()
dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1)
if one_hot:
dir_targets = torch.zeros(
*list(dir_cls_targets.shape),
num_bins,
dtype=reg_targets.dtype,
device=dir_cls_targets.device)
            dir_targets.scatter_(-1,
                                 dir_cls_targets.unsqueeze(dim=-1).long(), 1.0)
dir_cls_targets = dir_targets
return dir_cls_targets
@force_fp32(
apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'attr_preds',
'centernesses'))
def loss(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
centernesses,
gt_bboxes,
gt_labels,
gt_bboxes_3d,
gt_labels_3d,
centers2d,
depths,
attr_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Attribute scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_attrs.
centernesses (list[Tensor]): Centerness for each scale level, each
is a 4D-tensor, the channel number is num_points * 1.
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
gt_bboxes_3d (list[Tensor]): 3D boxes ground truth with shape of
(num_gts, code_size).
gt_labels_3d (list[Tensor]): same as gt_labels
centers2d (list[Tensor]): 2D centers on the image with shape of
(num_gts, 2).
depths (list[Tensor]): Depth ground truth with shape of
(num_gts, ).
attr_labels (list[Tensor]): Attributes indices of each box.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert len(cls_scores) == len(bbox_preds) == len(centernesses) == len(
attr_preds)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
bbox_preds[0].device)
labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \
self.get_targets(
all_level_points, gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels)
num_imgs = cls_scores[0].size(0)
# flatten cls_scores, bbox_preds, dir_cls_preds and centerness
flatten_cls_scores = [
cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
for cls_score in cls_scores
]
flatten_bbox_preds = [
bbox_pred.permute(0, 2, 3, 1).reshape(-1, sum(self.group_reg_dims))
for bbox_pred in bbox_preds
]
flatten_dir_cls_preds = [
dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2)
for dir_cls_pred in dir_cls_preds
]
flatten_centerness = [
centerness.permute(0, 2, 3, 1).reshape(-1)
for centerness in centernesses
]
flatten_cls_scores = torch.cat(flatten_cls_scores)
flatten_bbox_preds = torch.cat(flatten_bbox_preds)
flatten_dir_cls_preds = torch.cat(flatten_dir_cls_preds)
flatten_centerness = torch.cat(flatten_centerness)
flatten_labels_3d = torch.cat(labels_3d)
flatten_bbox_targets_3d = torch.cat(bbox_targets_3d)
flatten_centerness_targets = torch.cat(centerness_targets)
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
bg_class_ind = self.num_classes
pos_inds = ((flatten_labels_3d >= 0)
& (flatten_labels_3d < bg_class_ind)).nonzero().reshape(-1)
num_pos = len(pos_inds)
loss_cls = self.loss_cls(
flatten_cls_scores,
flatten_labels_3d,
            avg_factor=num_pos + num_imgs)  # avoid num_pos being 0
pos_bbox_preds = flatten_bbox_preds[pos_inds]
pos_dir_cls_preds = flatten_dir_cls_preds[pos_inds]
pos_centerness = flatten_centerness[pos_inds]
if self.pred_attrs:
flatten_attr_preds = [
attr_pred.permute(0, 2, 3, 1).reshape(-1, self.num_attrs)
for attr_pred in attr_preds
]
flatten_attr_preds = torch.cat(flatten_attr_preds)
flatten_attr_targets = torch.cat(attr_targets)
pos_attr_preds = flatten_attr_preds[pos_inds]
if num_pos > 0:
pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds]
pos_centerness_targets = flatten_centerness_targets[pos_inds]
if self.pred_attrs:
pos_attr_targets = flatten_attr_targets[pos_inds]
bbox_weights = pos_centerness_targets.new_ones(
len(pos_centerness_targets), sum(self.group_reg_dims))
equal_weights = pos_centerness_targets.new_ones(
pos_centerness_targets.shape)
code_weight = self.train_cfg.get('code_weight', None)
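            # optional per-dimension reweighting of the regression loss; the
            # nuScenes train_cfg down-weights depth (0.2) and velocity (0.05)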
if code_weight:
assert len(code_weight) == sum(self.group_reg_dims)
bbox_weights = bbox_weights * bbox_weights.new_tensor(
code_weight)
if self.use_direction_classifier:
pos_dir_cls_targets = self.get_direction_target(
pos_bbox_targets_3d, self.dir_offset, one_hot=False)
if self.diff_rad_by_sin:
pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference(
pos_bbox_preds, pos_bbox_targets_3d)
loss_offset = self.loss_bbox(
pos_bbox_preds[:, :2],
pos_bbox_targets_3d[:, :2],
weight=bbox_weights[:, :2],
avg_factor=equal_weights.sum())
loss_depth = self.loss_bbox(
pos_bbox_preds[:, 2],
pos_bbox_targets_3d[:, 2],
weight=bbox_weights[:, 2],
avg_factor=equal_weights.sum())
loss_size = self.loss_bbox(
pos_bbox_preds[:, 3:6],
pos_bbox_targets_3d[:, 3:6],
weight=bbox_weights[:, 3:6],
avg_factor=equal_weights.sum())
loss_rotsin = self.loss_bbox(
pos_bbox_preds[:, 6],
pos_bbox_targets_3d[:, 6],
weight=bbox_weights[:, 6],
avg_factor=equal_weights.sum())
loss_velo = None
if self.pred_velo:
loss_velo = self.loss_bbox(
pos_bbox_preds[:, 7:9],
pos_bbox_targets_3d[:, 7:9],
weight=bbox_weights[:, 7:9],
avg_factor=equal_weights.sum())
loss_centerness = self.loss_centerness(pos_centerness,
pos_centerness_targets)
# direction classification loss
loss_dir = None
# TODO: add more check for use_direction_classifier
if self.use_direction_classifier:
loss_dir = self.loss_dir(
pos_dir_cls_preds,
pos_dir_cls_targets,
equal_weights,
avg_factor=equal_weights.sum())
# attribute classification loss
loss_attr = None
if self.pred_attrs:
loss_attr = self.loss_attr(
pos_attr_preds,
pos_attr_targets,
pos_centerness_targets,
avg_factor=pos_centerness_targets.sum())
else:
            # no positive samples: use (zero-valued) sums of the predictions
            # so every output stays connected to the computation graph
loss_offset = pos_bbox_preds[:, :2].sum()
loss_depth = pos_bbox_preds[:, 2].sum()
loss_size = pos_bbox_preds[:, 3:6].sum()
loss_rotsin = pos_bbox_preds[:, 6].sum()
loss_velo = None
if self.pred_velo:
loss_velo = pos_bbox_preds[:, 7:9].sum()
loss_centerness = pos_centerness.sum()
loss_dir = None
if self.use_direction_classifier:
loss_dir = pos_dir_cls_preds.sum()
loss_attr = None
if self.pred_attrs:
loss_attr = pos_attr_preds.sum()
loss_dict = dict(
loss_cls=loss_cls,
loss_offset=loss_offset,
loss_depth=loss_depth,
loss_size=loss_size,
loss_rotsin=loss_rotsin,
loss_centerness=loss_centerness)
if loss_velo is not None:
loss_dict['loss_velo'] = loss_velo
if loss_dir is not None:
loss_dict['loss_dir'] = loss_dir
if loss_attr is not None:
loss_dict['loss_attr'] = loss_attr
return loss_dict
@force_fp32(
apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'attr_preds',
'centernesses'))
def get_bboxes(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
centernesses,
img_metas,
cfg=None,
rescale=None):
"""Transform network output for a batch into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_points * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level with shape (N, num_points * bbox_code_size, H, W)
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on each scale level, each is a 4D-tensor,
the channel number is num_points * 2. (bin = 2)
attr_preds (list[Tensor]): Attribute scores for each scale level
Has shape (N, num_points * num_attrs, H, W)
centernesses (list[Tensor]): Centerness for each scale level with
shape (N, num_points * 1, H, W)
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used
rescale (bool): If True, return boxes in original image space
Returns:
            list[tuple]: Each item in result_list is a tuple of the predicted
                3D boxes, scores, labels and attributes (None when attributes
                are not predicted) for one image.
"""
assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \
len(centernesses) == len(attr_preds)
num_levels = len(cls_scores)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
bbox_preds[0].device)
result_list = []
for img_id in range(len(img_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds[i][img_id].detach() for i in range(num_levels)
]
if self.use_direction_classifier:
dir_cls_pred_list = [
dir_cls_preds[i][img_id].detach()
for i in range(num_levels)
]
else:
dir_cls_pred_list = [
cls_scores[i][img_id].new_full(
[2, *cls_scores[i][img_id].shape[1:]], 0).detach()
for i in range(num_levels)
]
if self.pred_attrs:
attr_pred_list = [
attr_preds[i][img_id].detach() for i in range(num_levels)
]
else:
attr_pred_list = [
cls_scores[i][img_id].new_full(
[self.num_attrs, *cls_scores[i][img_id].shape[1:]],
self.attr_background_label).detach()
for i in range(num_levels)
]
centerness_pred_list = [
centernesses[i][img_id].detach() for i in range(num_levels)
]
input_meta = img_metas[img_id]
det_bboxes = self._get_bboxes_single(
cls_score_list, bbox_pred_list, dir_cls_pred_list,
attr_pred_list, centerness_pred_list, mlvl_points, input_meta,
cfg, rescale)
result_list.append(det_bboxes)
return result_list
def _get_bboxes_single(self,
cls_scores,
bbox_preds,
dir_cls_preds,
attr_preds,
centernesses,
mlvl_points,
input_meta,
cfg,
rescale=False):
"""Transform outputs for a single batch item into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for a single scale level
Has shape (num_points * num_classes, H, W).
bbox_preds (list[Tensor]): Box energies / deltas for a single scale
level with shape (num_points * bbox_code_size, H, W).
dir_cls_preds (list[Tensor]): Box scores for direction class
predictions on a single scale level with shape \
(num_points * 2, H, W)
attr_preds (list[Tensor]): Attribute scores for each scale level
Has shape (N, num_points * num_attrs, H, W)
centernesses (list[Tensor]): Centerness for a single scale level
with shape (num_points, H, W).
mlvl_points (list[Tensor]): Box reference for a single scale level
with shape (num_total_points, 2).
input_meta (dict): Metadata of input image.
cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used.
rescale (bool): If True, return boxes in original image space.
Returns:
tuples[Tensor]: Predicted 3D boxes, scores, labels and attributes.
"""
view = np.array(input_meta['cam_intrinsic'])
scale_factor = input_meta['scale_factor']
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
mlvl_centers2d = []
mlvl_bboxes = []
mlvl_scores = []
mlvl_dir_scores = []
mlvl_attr_scores = []
mlvl_centerness = []
for cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \
points in zip(cls_scores, bbox_preds, dir_cls_preds,
attr_preds, centernesses, mlvl_points):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
scores = cls_score.permute(1, 2, 0).reshape(
-1, self.cls_out_channels).sigmoid()
dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
attr_pred = attr_pred.permute(1, 2, 0).reshape(-1, self.num_attrs)
attr_score = torch.max(attr_pred, dim=-1)[1]
centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid()
bbox_pred = bbox_pred.permute(1, 2,
0).reshape(-1,
sum(self.group_reg_dims))
bbox_pred = bbox_pred[:, :self.bbox_code_size]
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
max_scores, _ = (scores * centerness[:, None]).max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
points = points[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
dir_cls_pred = dir_cls_pred[topk_inds, :]
centerness = centerness[topk_inds]
dir_cls_score = dir_cls_score[topk_inds]
attr_score = attr_score[topk_inds]
# change the offset to actual center predictions
bbox_pred[:, :2] = points - bbox_pred[:, :2]
if rescale:
bbox_pred[:, :2] /= bbox_pred[:, :2].new_tensor(scale_factor)
pred_center2d = bbox_pred[:, :3].clone()
bbox_pred[:, :3] = self.pts2Dto3D(bbox_pred[:, :3], view)
mlvl_centers2d.append(pred_center2d)
mlvl_bboxes.append(bbox_pred)
mlvl_scores.append(scores)
mlvl_dir_scores.append(dir_cls_score)
mlvl_attr_scores.append(attr_score)
mlvl_centerness.append(centerness)
mlvl_centers2d = torch.cat(mlvl_centers2d)
mlvl_bboxes = torch.cat(mlvl_bboxes)
mlvl_dir_scores = torch.cat(mlvl_dir_scores)
# change local yaw to global yaw for 3D nms
if mlvl_bboxes.shape[0] > 0:
dir_rot = limit_period(mlvl_bboxes[..., 6] - self.dir_offset, 0,
np.pi)
mlvl_bboxes[..., 6] = (
dir_rot + self.dir_offset +
np.pi * mlvl_dir_scores.to(mlvl_bboxes.dtype))
cam_intrinsic = mlvl_centers2d.new_zeros((4, 4))
cam_intrinsic[:view.shape[0], :view.shape[1]] = \
mlvl_centers2d.new_tensor(view)
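            # add the viewing angle of each projected center so the locally
            # predicted (allocentric) yaw becomes a global yaw in camera
            # coordinates before BEV NMS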
mlvl_bboxes[:, 6] = torch.atan2(
mlvl_centers2d[:, 0] - cam_intrinsic[0, 2],
cam_intrinsic[0, 0]) + mlvl_bboxes[:, 6]
mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
mlvl_bboxes, box_dim=self.bbox_code_size,
origin=(0.5, 0.5, 0.5)).bev)
mlvl_scores = torch.cat(mlvl_scores)
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
        # note that FG labels are set to [0, num_classes - 1] since mmdet v2.0
        # while the BG cat_id is num_classes
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
mlvl_attr_scores = torch.cat(mlvl_attr_scores)
mlvl_centerness = torch.cat(mlvl_centerness)
        # box3d_multiclass_nms takes no extra score factors,
        # so multiply the centerness into the scores beforehand
mlvl_nms_scores = mlvl_scores * mlvl_centerness[:, None]
results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
mlvl_nms_scores, cfg.score_thr,
cfg.max_per_img, cfg, mlvl_dir_scores,
mlvl_attr_scores)
bboxes, scores, labels, dir_scores, attrs = results
attrs = attrs.to(labels.dtype) # change data type to int
bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.bbox_code_size)
        # Note that the predictions use origin (0.5, 0.5, 0.5) because the
        # ground-truth centers2d are the gravity centers of the objects;
        # the center has already been shifted when computing the BEV boxes
if not self.pred_attrs:
attrs = None
return bboxes, scores, labels, attrs
@staticmethod
def pts2Dto3D(points, view):
"""
Args:
points (torch.Tensor): points in 2D images, [N, 3], \
3 corresponds with x, y in the image and depth.
            view (np.ndarray): camera intrinsic, [3, 3]
Returns:
torch.Tensor: points in 3D space. [N, 3], \
3 corresponds with x, y, z in 3D space.
"""
assert view.shape[0] <= 4
assert view.shape[1] <= 4
assert points.shape[1] == 3
points2D = points[:, :2]
depths = points[:, 2].view(-1, 1)
unnorm_points2D = torch.cat([points2D * depths, depths], dim=1)
viewpad = torch.eye(4, dtype=points2D.dtype, device=points2D.device)
viewpad[:view.shape[0], :view.shape[1]] = points2D.new_tensor(view)
inv_viewpad = torch.inverse(viewpad).transpose(0, 1)
# Do operation in homogenous coordinates.
nbr_points = unnorm_points2D.shape[0]
homo_points2D = torch.cat(
[unnorm_points2D,
points2D.new_ones((nbr_points, 1))], dim=1)
points3D = torch.mm(homo_points2D, inv_viewpad)[:, :3]
return points3D
def _get_points_single(self,
featmap_size,
stride,
dtype,
device,
flatten=False):
"""Get points according to feature map sizes."""
y, x = super()._get_points_single(featmap_size, stride, dtype, device)
points = torch.stack((x.reshape(-1) * stride, y.reshape(-1) * stride),
dim=-1) + stride // 2
return points
def get_targets(self, points, gt_bboxes_list, gt_labels_list,
gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list,
depths_list, attr_labels_list):
"""Compute regression, classification and centerss targets for points
in multiple images.
Args:
points (list[Tensor]): Points of each fpn level, each has shape
(num_points, 2).
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
each has shape (num_gt, 4).
gt_labels_list (list[Tensor]): Ground truth labels of each box,
each has shape (num_gt,).
gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each
image, each has shape (num_gt, bbox_code_size).
gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each
box, each has shape (num_gt,).
centers2d_list (list[Tensor]): Projected 3D centers onto 2D image,
each has shape (num_gt, 2).
depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
image, each has shape (num_gt, 1).
attr_labels_list (list[Tensor]): Attribute labels of each box,
each has shape (num_gt,).
Returns:
tuple:
                concat_lvl_labels_3d (list[Tensor]): 3D labels of each level.
                concat_lvl_bbox_targets_3d (list[Tensor]): 3D bbox targets of
                    each level.
                concat_lvl_centerness_targets (list[Tensor]): Centerness
                    targets of each level.
                concat_lvl_attr_targets (list[Tensor]): Attribute targets of
                    each level.
"""
assert len(points) == len(self.regress_ranges)
num_levels = len(points)
# expand regress ranges to align with points
expanded_regress_ranges = [
points[i].new_tensor(self.regress_ranges[i])[None].expand_as(
points[i]) for i in range(num_levels)
]
# concat all levels points and regress ranges
concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0)
concat_points = torch.cat(points, dim=0)
# the number of points per img, per lvl
num_points = [center.size(0) for center in points]
if attr_labels_list is None:
attr_labels_list = [
gt_labels.new_full(gt_labels.shape, self.attr_background_label)
for gt_labels in gt_labels_list
]
# get labels and bbox_targets of each image
_, _, labels_3d_list, bbox_targets_3d_list, centerness_targets_list, \
attr_targets_list = multi_apply(
self._get_target_single,
gt_bboxes_list,
gt_labels_list,
gt_bboxes_3d_list,
gt_labels_3d_list,
centers2d_list,
depths_list,
attr_labels_list,
points=concat_points,
regress_ranges=concat_regress_ranges,
num_points_per_lvl=num_points)
# split to per img, per level
labels_3d_list = [
labels_3d.split(num_points, 0) for labels_3d in labels_3d_list
]
bbox_targets_3d_list = [
bbox_targets_3d.split(num_points, 0)
for bbox_targets_3d in bbox_targets_3d_list
]
centerness_targets_list = [
centerness_targets.split(num_points, 0)
for centerness_targets in centerness_targets_list
]
attr_targets_list = [
attr_targets.split(num_points, 0)
for attr_targets in attr_targets_list
]
# concat per level image
concat_lvl_labels_3d = []
concat_lvl_bbox_targets_3d = []
concat_lvl_centerness_targets = []
concat_lvl_attr_targets = []
for i in range(num_levels):
concat_lvl_labels_3d.append(
torch.cat([labels[i] for labels in labels_3d_list]))
concat_lvl_centerness_targets.append(
torch.cat([
centerness_targets[i]
for centerness_targets in centerness_targets_list
]))
bbox_targets_3d = torch.cat([
bbox_targets_3d[i] for bbox_targets_3d in bbox_targets_3d_list
])
concat_lvl_attr_targets.append(
torch.cat(
[attr_targets[i] for attr_targets in attr_targets_list]))
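            # with norm_on_bbox, the 2D-offset targets are expressed in units
            # of the stride to match the stride-normalized predictions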
if self.norm_on_bbox:
bbox_targets_3d[:, :
2] = bbox_targets_3d[:, :2] / self.strides[i]
concat_lvl_bbox_targets_3d.append(bbox_targets_3d)
return concat_lvl_labels_3d, concat_lvl_bbox_targets_3d, \
concat_lvl_centerness_targets, concat_lvl_attr_targets
def _get_target_single(self, gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels,
points, regress_ranges, num_points_per_lvl):
"""Compute regression and classification targets for a single image."""
num_points = points.size(0)
num_gts = gt_labels.size(0)
if not isinstance(gt_bboxes_3d, torch.Tensor):
gt_bboxes_3d = gt_bboxes_3d.tensor.to(gt_bboxes.device)
if num_gts == 0:
return gt_labels.new_full((num_points,), self.background_label), \
gt_bboxes.new_zeros((num_points, 4)), \
gt_labels_3d.new_full(
(num_points,), self.background_label), \
gt_bboxes_3d.new_zeros((num_points, self.bbox_code_size)), \
gt_bboxes_3d.new_zeros((num_points,)), \
attr_labels.new_full(
(num_points,), self.attr_background_label)
areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
gt_bboxes[:, 3] - gt_bboxes[:, 1])
areas = areas[None].repeat(num_points, 1)
regress_ranges = regress_ranges[:, None, :].expand(
num_points, num_gts, 2)
gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4)
centers2d = centers2d[None].expand(num_points, num_gts, 2)
gt_bboxes_3d = gt_bboxes_3d[None].expand(num_points, num_gts,
self.bbox_code_size)
depths = depths[None, :, None].expand(num_points, num_gts, 1)
xs, ys = points[:, 0], points[:, 1]
xs = xs[:, None].expand(num_points, num_gts)
ys = ys[:, None].expand(num_points, num_gts)
delta_xs = (xs - centers2d[..., 0])[..., None]
delta_ys = (ys - centers2d[..., 1])[..., None]
bbox_targets_3d = torch.cat(
(delta_xs, delta_ys, depths, gt_bboxes_3d[..., 3:]), dim=-1)
left = xs - gt_bboxes[..., 0]
right = gt_bboxes[..., 2] - xs
top = ys - gt_bboxes[..., 1]
bottom = gt_bboxes[..., 3] - ys
bbox_targets = torch.stack((left, top, right, bottom), -1)
assert self.center_sampling is True, 'Setting center_sampling to '\
'False has not been implemented for FCOS3D.'
# condition1: inside a `center bbox`
radius = self.center_sample_radius
center_xs = centers2d[..., 0]
center_ys = centers2d[..., 1]
center_gts = torch.zeros_like(gt_bboxes)
stride = center_xs.new_zeros(center_xs.shape)
# project the points on current lvl back to the `original` sizes
lvl_begin = 0
for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl):
lvl_end = lvl_begin + num_points_lvl
stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius
lvl_begin = lvl_end
center_gts[..., 0] = center_xs - stride
center_gts[..., 1] = center_ys - stride
center_gts[..., 2] = center_xs + stride
center_gts[..., 3] = center_ys + stride
cb_dist_left = xs - center_gts[..., 0]
cb_dist_right = center_gts[..., 2] - xs
cb_dist_top = ys - center_gts[..., 1]
cb_dist_bottom = center_gts[..., 3] - ys
center_bbox = torch.stack(
(cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1)
inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
# condition2: limit the regression range for each location
max_regress_distance = bbox_targets.max(-1)[0]
inside_regress_range = (
(max_regress_distance >= regress_ranges[..., 0])
& (max_regress_distance <= regress_ranges[..., 1]))
# center-based criterion to deal with ambiguity
dists = torch.sqrt(torch.sum(bbox_targets_3d[..., :2]**2, dim=-1))
dists[inside_gt_bbox_mask == 0] = INF
dists[inside_regress_range == 0] = INF
min_dist, min_dist_inds = dists.min(dim=1)
labels = gt_labels[min_dist_inds]
labels_3d = gt_labels_3d[min_dist_inds]
attr_labels = attr_labels[min_dist_inds]
labels[min_dist == INF] = self.background_label # set as BG
labels_3d[min_dist == INF] = self.background_label # set as BG
attr_labels[min_dist == INF] = self.attr_background_label
bbox_targets = bbox_targets[range(num_points), min_dist_inds]
bbox_targets_3d = bbox_targets_3d[range(num_points), min_dist_inds]
relative_dists = torch.sqrt(
torch.sum(bbox_targets_3d[..., :2]**2,
dim=-1)) / (1.414 * stride[:, 0])
# [N, 1] / [N, 1]
centerness_targets = torch.exp(-self.centerness_alpha * relative_dists)
return labels, bbox_targets, labels_3d, bbox_targets_3d, \
centerness_targets, attr_labels
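A small numeric sketch (illustrative only) of the get_direction_target encoding above, using the nuScenes dir_offset of pi/4 from the config and the default two bins:

import numpy as np
import torch
from mmdet3d.core import limit_period

yaw = torch.tensor([np.pi / 2, 0.0])
offset_rot = limit_period(yaw - np.pi / 4, 0, 2 * np.pi)
bins = torch.clamp(torch.floor(offset_rot / np.pi).long(), min=0, max=1)
print(bins)  # tensor([0, 1]): pi/2 falls in bin 0, yaw 0 wraps into bin 1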
@@ -5,7 +5,8 @@ import random
import torch
from os.path import dirname, exists, join
from mmdet3d.core.bbox import (CameraInstance3DBoxes, DepthInstance3DBoxes,
                               LiDARInstance3DBoxes)
from mmdet3d.models.builder import build_detector
@@ -316,3 +317,56 @@ def test_centerpoint():
assert boxes_3d_0.tensor.shape[1] == 9
assert scores_3d_0.shape[0] >= 0
assert labels_3d_0.shape[0] >= 0
def test_fcos3d():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
_setup_seed(0)
fcos3d_cfg = _get_detector_cfg(
'fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py')
self = build_detector(fcos3d_cfg).cuda()
imgs = torch.rand([1, 3, 928, 1600], dtype=torch.float32).cuda()
gt_bboxes = [torch.rand([3, 4], dtype=torch.float32).cuda()]
gt_bboxes_3d = CameraInstance3DBoxes(
torch.rand([3, 9], device='cuda'), box_dim=9)
gt_labels = [torch.randint(0, 10, [3], device='cuda')]
gt_labels_3d = gt_labels
centers2d = [torch.rand([3, 2], dtype=torch.float32).cuda()]
depths = [torch.rand([3], dtype=torch.float32).cuda()]
attr_labels = [torch.randint(0, 9, [3], device='cuda')]
img_metas = [
dict(
cam_intrinsic=[[1260.8474446004698, 0.0, 807.968244525554],
[0.0, 1260.8474446004698, 495.3344268742088],
[0.0, 0.0, 1.0]],
scale_factor=np.array([1., 1., 1., 1.], dtype=np.float32),
box_type_3d=CameraInstance3DBoxes)
]
# test forward_train
losses = self.forward_train(imgs, img_metas, gt_bboxes, gt_labels,
gt_bboxes_3d, gt_labels_3d, centers2d, depths,
attr_labels)
assert losses['loss_cls'] >= 0
assert losses['loss_offset'] >= 0
assert losses['loss_depth'] >= 0
assert losses['loss_size'] >= 0
assert losses['loss_rotsin'] >= 0
assert losses['loss_centerness'] >= 0
assert losses['loss_velo'] >= 0
assert losses['loss_dir'] >= 0
assert losses['loss_attr'] >= 0
# test simple_test
results = self.simple_test(imgs, img_metas)
boxes_3d = results[0]['img_bbox']['boxes_3d']
scores_3d = results[0]['img_bbox']['scores_3d']
labels_3d = results[0]['img_bbox']['labels_3d']
attrs_3d = results[0]['img_bbox']['attrs_3d']
assert boxes_3d.tensor.shape[0] >= 0
assert boxes_3d.tensor.shape[1] == 9
assert scores_3d.shape[0] >= 0
assert labels_3d.shape[0] >= 0
assert attrs_3d.shape[0] >= 0
@@ -5,8 +5,8 @@ import random
import torch
from os.path import dirname, exists, join
from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes,
                               DepthInstance3DBoxes, LiDARInstance3DBoxes)
from mmdet3d.models.builder import build_head
from mmdet.apis import set_random_seed
@@ -1044,3 +1044,73 @@ def test_shape_aware_head_getboxes():
input_metas)
assert len(result_list[0][1]) > 0 # ensure not all boxes are filtered
assert (result_list[0][1] > 0.3).all()
def test_fcos_mono3d_head():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
_setup_seed(0)
fcos3d_head_cfg = _get_head_cfg(
'fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py')
self = build_head(fcos3d_head_cfg).cuda()
feats = [
torch.rand([2, 256, 116, 200], dtype=torch.float32).cuda(),
torch.rand([2, 256, 58, 100], dtype=torch.float32).cuda(),
torch.rand([2, 256, 29, 50], dtype=torch.float32).cuda(),
torch.rand([2, 256, 15, 25], dtype=torch.float32).cuda(),
torch.rand([2, 256, 8, 13], dtype=torch.float32).cuda()
]
# test forward
ret_dict = self(feats)
assert len(ret_dict) == 5
assert len(ret_dict[0]) == 5
assert ret_dict[0][0].shape == torch.Size([2, 10, 116, 200])
# test loss
gt_bboxes = [
torch.rand([3, 4], dtype=torch.float32).cuda(),
torch.rand([3, 4], dtype=torch.float32).cuda()
]
gt_bboxes_3d = CameraInstance3DBoxes(
torch.rand([3, 9], device='cuda'), box_dim=9)
gt_labels = [torch.randint(0, 10, [3], device='cuda') for i in range(2)]
gt_labels_3d = gt_labels
centers2d = [
torch.rand([3, 2], dtype=torch.float32).cuda(),
torch.rand([3, 2], dtype=torch.float32).cuda()
]
depths = [
torch.rand([3], dtype=torch.float32).cuda(),
torch.rand([3], dtype=torch.float32).cuda()
]
attr_labels = [torch.randint(0, 9, [3], device='cuda') for i in range(2)]
img_metas = [
dict(
cam_intrinsic=[[1260.8474446004698, 0.0, 807.968244525554],
[0.0, 1260.8474446004698, 495.3344268742088],
[0.0, 0.0, 1.0]],
scale_factor=np.array([1., 1., 1., 1.], dtype=np.float32),
box_type_3d=CameraInstance3DBoxes) for i in range(2)
]
losses = self.loss(*ret_dict, gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels, img_metas)
assert losses['loss_cls'] >= 0
assert losses['loss_offset'] >= 0
assert losses['loss_depth'] >= 0
assert losses['loss_size'] >= 0
assert losses['loss_rotsin'] >= 0
assert losses['loss_centerness'] >= 0
assert losses['loss_velo'] >= 0
assert losses['loss_dir'] >= 0
assert losses['loss_attr'] >= 0
# test get_boxes
results = self.get_bboxes(*ret_dict, img_metas)
assert len(results) == 2
assert len(results[0]) == 4
assert results[0][0].tensor.shape == torch.Size([200, 9])
assert results[0][1].shape == torch.Size([200])
assert results[0][2].shape == torch.Size([200])
assert results[0][3].shape == torch.Size([200])