OpenDAS / mmdetection3d: Commit 8538177b
"torchvision/models/vscode:/vscode.git/clone" did not exist on "7c95f97a01a0a5f6f434d6b72b18673cace1fbd2"
Unverified commit 8538177b, authored Jan 21, 2022 by ChaimZhu; committed by GitHub on Jan 21, 2022.

[Feature] Add MonoFlex Head (#1044)

Parent: 4590418e
Showing 12 changed files with 1093 additions and 26 deletions:

- mmdet3d/core/bbox/coders/monoflex_bbox_coder.py (+13, -13)
- mmdet3d/core/bbox/structures/utils.py (+7, -4)
- mmdet3d/core/utils/__init__.py (+4, -2)
- mmdet3d/core/utils/gaussian.py (+72, -0)
- mmdet3d/models/dense_heads/__init__.py (+3, -1)
- mmdet3d/models/dense_heads/monoflex_head.py (+770, -0)
- mmdet3d/models/model_utils/__init__.py (+2, -1)
- mmdet3d/models/model_utils/edge_fusion_module.py (+77, -0)
- mmdet3d/models/utils/edge_indices.py (+11, -1)
- tests/test_models/test_heads/test_heads.py (+59, -0)
- tests/test_utils/test_bbox_coders.py (+67, -0)
- tests/test_utils/test_utils.py (+8, -4)
mmdet3d/core/bbox/coders/monoflex_bbox_coder.py

```diff
@@ -81,16 +81,16 @@ class MonoFlexCoder(BaseBBoxCoder):
             torch.Tensor: Targets of orientations.
         """
         local_yaw = gt_bboxes_3d.local_yaw

         # encode local yaw (-pi ~ pi) to multibin format
-        encode_local_yaw = np.zeros(self.num_dir_bins * 2)
+        encode_local_yaw = local_yaw.new_zeros(
+            [local_yaw.shape[0], self.num_dir_bins * 2])
         bin_size = 2 * np.pi / self.num_dir_bins
         margin_size = bin_size * self.bin_margin

-        bin_centers = self.bin_centers
+        bin_centers = local_yaw.new_tensor(self.bin_centers)
         range_size = bin_size / 2 + margin_size

-        offsets = local_yaw - bin_centers.unsqueeze(0)
+        offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0)
         offsets[offsets > np.pi] = offsets[offsets > np.pi] - 2 * np.pi
         offsets[offsets < -np.pi] = offsets[offsets < -np.pi] + 2 * np.pi
```
```diff
@@ -98,7 +98,7 @@ class MonoFlexCoder(BaseBBoxCoder):
             offset = offsets[:, i]
             inds = abs(offset) < range_size
             encode_local_yaw[inds, i] = 1
-            encode_local_yaw[inds, i + self.num_dir_bins] = offset
+            encode_local_yaw[inds, i + self.num_dir_bins] = offset[inds]

         orientation_target = encode_local_yaw
```
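The patched `encode()` is now vectorized over objects. The following minimal standalone sketch of the multibin scheme (toy yaw values; `num_dir_bins = 4` and `bin_margin = np.pi / 6` mirror the test configs added later in this commit) shows what the targets look like:

```python
import numpy as np
import torch

# Defaults mirrored from the test configs added in this commit.
num_dir_bins = 4
bin_margin = np.pi / 6
bin_centers = torch.tensor([0, np.pi / 2, np.pi, -np.pi / 2])

local_yaw = torch.tensor([0.1, -3.0, 2.5])  # toy local yaws in (-pi, pi)

bin_size = 2 * np.pi / num_dir_bins
margin_size = bin_size * bin_margin
range_size = bin_size / 2 + margin_size

# (N, num_bins) signed offset of every yaw to every bin center,
# wrapped back into (-pi, pi) exactly as in the patched encode()
offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0)
offsets[offsets > np.pi] = offsets[offsets > np.pi] - 2 * np.pi
offsets[offsets < -np.pi] = offsets[offsets < -np.pi] + 2 * np.pi

encoding = local_yaw.new_zeros([local_yaw.shape[0], num_dir_bins * 2])
for i in range(num_dir_bins):
    offset = offsets[:, i]
    inds = abs(offset) < range_size
    encoding[inds, i] = 1
    encoding[inds, i + num_dir_bins] = offset[inds]

print(encoding)  # first 4 columns: bin hits; last 4: in-bin offsets
```

Because each bin is widened by `margin_size` beyond its half-width, neighbouring bins overlap and a yaw near a boundary legitimately activates two bins, which is why `offset[inds]` must be masked on both sides of the assignment (the bug the second hunk fixes).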
```diff
@@ -164,7 +164,7 @@ class MonoFlexCoder(BaseBBoxCoder):
         pred_direct_depth_uncertainty = bbox[:, 49:50].squeeze(-1)

         # 2 dimension of offsets x keypoints (8 corners + top/bottom center)
-        pred_keypoints2d = bbox[:, 6:26]
+        pred_keypoints2d = bbox[:, 6:26].reshape(-1, 10, 2)

         # 1 dimension for depth offsets
         pred_direct_depth_offsets = bbox[:, 48:49].squeeze(-1)
```
```diff
@@ -273,11 +273,11 @@ class MonoFlexCoder(BaseBBoxCoder):
             raise NotImplementedError

         # (N, 3)
-        centers2d_img = \
-            torch.cat(centers2d_img, depths.unsqueeze(-1), dim=1)
+        centers2d_img = \
+            torch.cat((centers2d_img, depths.unsqueeze(-1)), dim=1)
         # (N, 4, 1)
         centers2d_extend = \
             torch.cat((centers2d_img, centers2d_img.new_ones(N, 1)),
-                      dim=1).unqueeze(-1)
+                      dim=1).unsqueeze(-1)
         locations = torch.matmul(cam2imgs_inv, centers2d_extend).squeeze(-1)

         return locations[:, :3]
```
```diff
@@ -450,15 +450,15 @@ class MonoFlexCoder(BaseBBoxCoder):
         local_yaws = orientations
         yaws = local_yaws + rays

-        larger_idx = (yaws > np.pi).nonzero()
-        small_idx = (yaws < -np.pi).nonzero()
+        larger_idx = (yaws > np.pi).nonzero(as_tuple=False)
+        small_idx = (yaws < -np.pi).nonzero(as_tuple=False)
         if len(larger_idx) != 0:
             yaws[larger_idx] -= 2 * np.pi
         if len(small_idx) != 0:
             yaws[small_idx] += 2 * np.pi

-        larger_idx = (local_yaws > np.pi).nonzero()
-        small_idx = (local_yaws < -np.pi).nonzero()
+        larger_idx = (local_yaws > np.pi).nonzero(as_tuple=False)
+        small_idx = (local_yaws < -np.pi).nonzero(as_tuple=False)
         if len(larger_idx) != 0:
             local_yaws[larger_idx] -= 2 * np.pi
         if len(small_idx) != 0:
```
```diff
@@ -491,7 +491,7 @@ class MonoFlexCoder(BaseBBoxCoder):
         return bboxes2d

-    def combine_depths(depth, depth_uncertainty):
+    def combine_depths(self, depth, depth_uncertainty):
         """Combine all the predicted depths with depth uncertainty.

         Args:
```
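Only the signature fix (`self` was missing) is shown in this hunk; the method body is elsewhere in the file. For intuition, the soft depth ensembling MonoFlex uses to merge the direct depth with the keypoint-decoded depths can be sketched as an inverse-uncertainty weighted average (an assumed standalone version, not the coder's exact body):

```python
import torch


def combine_depths(depth, depth_uncertainty):
    """Sketch: inverse-uncertainty weighted average over the depth
    estimates (here (N, 4): direct depth + 3 keypoint-decoded depths)."""
    # Lower uncertainty -> larger weight; weights normalized per sample.
    weights = 1 / depth_uncertainty
    weights = weights / weights.sum(dim=1, keepdim=True)
    return torch.sum(depth * weights, dim=1)


depth = torch.tensor([[10.0, 12.0, 11.0, 30.0]])
uncertainty = torch.tensor([[0.5, 0.5, 0.5, 5.0]])  # the outlier is uncertain
print(combine_depths(depth, uncertainty))  # ~11.6: outlier barely contributes
```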
mmdet3d/core/bbox/structures/utils.py

```diff
@@ -324,8 +324,11 @@ def yaw2local(yaw, loc):
         torch.Tensor: local yaw (alpha in kitti).
     """
     local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2])
-    while local_yaw > np.pi:
-        local_yaw -= np.pi * 2
-    while local_yaw < -np.pi:
-        local_yaw += np.pi * 2
+    larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False)
+    small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False)
+    if len(larger_idx) != 0:
+        local_yaw[larger_idx] -= 2 * np.pi
+    if len(small_idx) != 0:
+        local_yaw[small_idx] += 2 * np.pi

     return local_yaw
```
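The old `while` comparisons only work on scalars; on a batched tensor, `local_yaw > np.pi` is itself a tensor and using it as a Python condition raises an error. The vectorized wrap handles every element, and a single pass suffices here because `yaw` and `torch.atan2(...)` each lie in (-pi, pi], so their difference is within (-2pi, 2pi). A quick check:

```python
import numpy as np
import torch

local_yaw = torch.tensor([3.5, -3.5, 1.0])  # at most one period out of range

larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False)
small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False)
if len(larger_idx) != 0:
    local_yaw[larger_idx] -= 2 * np.pi
if len(small_idx) != 0:
    local_yaw[small_idx] += 2 * np.pi

assert ((local_yaw >= -np.pi) & (local_yaw <= np.pi)).all()
print(local_yaw)  # tensor([-2.7832,  2.7832,  1.0000])
```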
mmdet3d/core/utils/__init__.py

```diff
 # Copyright (c) OpenMMLab. All rights reserved.
 from .array_converter import ArrayConverter, array_converter
-from .gaussian import draw_heatmap_gaussian, gaussian_2d, gaussian_radius
+from .gaussian import (draw_heatmap_gaussian, ellip_gaussian2D, gaussian_2d,
+                       gaussian_radius, get_ellip_gaussian_2D)

 __all__ = [
     'gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian',
-    'ArrayConverter', 'array_converter'
+    'ArrayConverter', 'array_converter', 'ellip_gaussian2D',
+    'get_ellip_gaussian_2D'
 ]
```
mmdet3d/core/utils/gaussian.py

@@ -84,3 +84,75 @@ def gaussian_radius(det_size, min_overlap=0.5):

All new lines; the two functions below are appended after the existing `gaussian_radius`:

```python
    # (tail of the existing gaussian_radius, shown for context)
    sq3 = torch.sqrt(b3**2 - 4 * a3 * c3)
    r3 = (b3 + sq3) / 2
    return min(r1, r2, r3)


def get_ellip_gaussian_2D(heatmap, center, radius_x, radius_y, k=1):
    """Generate 2D ellipse gaussian heatmap.

    Args:
        heatmap (Tensor): Input heatmap, the gaussian kernel will cover on
            it and maintain the max value.
        center (list[int]): Coord of gaussian kernel's center.
        radius_x (int): X-axis radius of gaussian kernel.
        radius_y (int): Y-axis radius of gaussian kernel.
        k (int, optional): Coefficient of gaussian kernel. Default: 1.

    Returns:
        out_heatmap (Tensor): Updated heatmap covered by gaussian kernel.
    """
    diameter_x, diameter_y = 2 * radius_x + 1, 2 * radius_y + 1
    gaussian_kernel = ellip_gaussian2D((radius_x, radius_y),
                                       sigma_x=diameter_x / 6,
                                       sigma_y=diameter_y / 6,
                                       dtype=heatmap.dtype,
                                       device=heatmap.device)

    x, y = int(center[0]), int(center[1])
    height, width = heatmap.shape[0:2]

    left, right = min(x, radius_x), min(width - x, radius_x + 1)
    top, bottom = min(y, radius_y), min(height - y, radius_y + 1)

    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
    masked_gaussian = gaussian_kernel[radius_y - top:radius_y + bottom,
                                      radius_x - left:radius_x + right]
    out_heatmap = heatmap
    torch.max(
        masked_heatmap,
        masked_gaussian * k,
        out=out_heatmap[y - top:y + bottom, x - left:x + right])

    return out_heatmap


def ellip_gaussian2D(radius,
                     sigma_x,
                     sigma_y,
                     dtype=torch.float32,
                     device='cpu'):
    """Generate 2D ellipse gaussian kernel.

    Args:
        radius (tuple(int)): Ellipse radius (radius_x, radius_y) of gaussian
            kernel.
        sigma_x (int): X-axis sigma of gaussian function.
        sigma_y (int): Y-axis sigma of gaussian function.
        dtype (torch.dtype, optional): Dtype of gaussian tensor.
            Default: torch.float32.
        device (str, optional): Device of gaussian tensor.
            Default: 'cpu'.

    Returns:
        h (Tensor): Gaussian kernel with a
            ``(2 * radius_y + 1) * (2 * radius_x + 1)`` shape.
    """
    x = torch.arange(
        -radius[0], radius[0] + 1, dtype=dtype, device=device).view(1, -1)
    y = torch.arange(
        -radius[1], radius[1] + 1, dtype=dtype, device=device).view(-1, 1)

    h = (-(x * x) / (2 * sigma_x * sigma_x) -
         (y * y) / (2 * sigma_y * sigma_y)).exp()

    h[h < torch.finfo(h.dtype).eps * h.max()] = 0

    return h
```
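A quick sanity check of the new elliptical heatmap (assuming a checkout including this commit; sizes are arbitrary). With `radius_x=6, radius_y=2` the kernel is elongated along x, which is the point for objects truncated at the left or right image border:

```python
import torch
from mmdet3d.core.utils import get_ellip_gaussian_2D

heatmap = torch.zeros((32, 32))
# Elongated along x and flat along y: the shape used for truncated objects.
get_ellip_gaussian_2D(heatmap, [16, 16], radius_x=6, radius_y=2)

assert heatmap[16, 16] == 1.0             # peak value at the center
assert heatmap[16, 22] > heatmap[22, 16]  # decays slower along x than y
```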
mmdet3d/models/dense_heads/__init__.py

```diff
@@ -7,6 +7,7 @@ from .centerpoint_head import CenterHead
 from .fcos_mono3d_head import FCOSMono3DHead
 from .free_anchor3d_head import FreeAnchor3DHead
 from .groupfree3d_head import GroupFree3DHead
+from .monoflex_head import MonoFlexHead
 from .parta2_rpn_head import PartA2RPNHead
 from .pgd_head import PGDHead
 from .point_rpn_head import PointRPNHead
@@ -19,5 +20,6 @@ __all__ = [
     'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead',
     'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
     'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead',
-    'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead'
+    'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead',
+    'MonoFlexHead'
 ]
```
mmdet3d/models/dense_heads/monoflex_head.py (new file, 0 → 100644)

```python
import torch
from mmcv.cnn import xavier_init
from torch import nn as nn

from mmdet3d.core.utils import get_ellip_gaussian_2D
from mmdet3d.models.model_utils import EdgeFusionModule
from mmdet3d.models.utils import (filter_outside_objs, get_edge_indices,
                                  get_keypoints, handle_proj_objs)
from mmdet.core import multi_apply
from mmdet.core.bbox.builder import build_bbox_coder
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.utils import gaussian_radius, gen_gaussian_target
from mmdet.models.utils.gaussian_target import (get_local_maximum,
                                                get_topk_from_heatmap,
                                                transpose_and_gather_feat)
from .anchor_free_mono3d_head import AnchorFreeMono3DHead


@HEADS.register_module()
class MonoFlexHead(AnchorFreeMono3DHead):
    r"""MonoFlex head used in `MonoFlex <https://arxiv.org/abs/2104.02323>`_

    .. code-block:: none

                / --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> cls
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> 2d bbox
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> 2d offsets
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> keypoints offsets
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
        feature
                | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> 3d dimensions
                |
                |                  |--- 1 x 1 conv --> ori cls
                | --> 3 x 3 conv --|
                |                  |--- 1 x 1 conv --> ori offsets
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> depth
                |
                \ --> 3 x 3 conv --> 1 x 1 conv --> depth uncertainty

    Args:
        use_edge_fusion (bool): Whether to use edge fusion module while
            feature extraction.
        edge_fusion_inds (list[tuple]): Indices of feature to use edge fusion.
        edge_heatmap_ratio (float): Ratio of generating target heatmap.
        filter_outside_objs (bool, optional): Whether to filter the
            outside objects. Default: True.
        loss_cls (dict, optional): Config of classification loss.
            Default: dict(type='GaussianFocalLoss', loss_weight=1.0).
        loss_bbox (dict, optional): Config of localization loss.
            Default: dict(type='IoULoss', loss_weight=0.1).
        loss_dir (dict, optional): Config of direction classification loss.
            Default: dict(type='MultiBinLoss', loss_weight=0.1).
        loss_keypoints (dict, optional): Config of keypoints loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_dims (dict, optional): Config of dimensions loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_offsets2d (dict, optional): Config of offsets2d loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_direct_depth (dict, optional): Config of directly regressed depth
            loss. Default: dict(type='L1Loss', loss_weight=0.1).
        loss_keypoints_depth (dict, optional): Config of keypoints decoded
            depth loss. Default: dict(type='L1Loss', loss_weight=0.1).
        loss_combined_depth (dict, optional): Config of combined depth loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_attr (dict, optional): Config of attribute classification loss.
            In MonoFlex, Default: None.
        bbox_coder (dict, optional): Bbox coder for encoding and decoding
            boxes. Default: dict(type='MonoFlexCoder', code_size=7).
        norm_cfg (dict, optional): Dictionary to construct and config norm
            layer. Default: dict(type='GN', num_groups=32, requires_grad=True).
        init_cfg (dict): Initialization config dict. Default: None.
    """  # noqa: E501

    def __init__(self,
                 num_classes,
                 in_channels,
                 use_edge_fusion,
                 edge_fusion_inds,
                 edge_heatmap_ratio,
                 filter_outside_objs=True,
                 loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0),
                 loss_bbox=dict(type='IoULoss', loss_weight=0.1),
                 loss_dir=dict(type='MultiBinLoss', loss_weight=0.1),
                 loss_keypoints=dict(type='L1Loss', loss_weight=0.1),
                 loss_dims=dict(type='L1Loss', loss_weight=0.1),
                 loss_offsets2d=dict(type='L1Loss', loss_weight=0.1),
                 loss_direct_depth=dict(type='L1Loss', loss_weight=0.1),
                 loss_keypoints_depth=dict(type='L1Loss', loss_weight=0.1),
                 loss_combined_depth=dict(type='L1Loss', loss_weight=0.1),
                 loss_attr=None,
                 bbox_coder=dict(type='MonoFlexCoder', code_size=7),
                 norm_cfg=dict(type='BN'),
                 init_cfg=None,
                 init_bias=-2.19,
                 **kwargs):
        self.use_edge_fusion = use_edge_fusion
        self.edge_fusion_inds = edge_fusion_inds
        super().__init__(
            num_classes,
            in_channels,
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            loss_dir=loss_dir,
            loss_attr=loss_attr,
            norm_cfg=norm_cfg,
            init_cfg=init_cfg,
            **kwargs)
        self.filter_outside_objs = filter_outside_objs
        self.edge_heatmap_ratio = edge_heatmap_ratio
        self.init_bias = init_bias
        self.loss_dir = build_loss(loss_dir)
        self.loss_keypoints = build_loss(loss_keypoints)
        self.loss_dims = build_loss(loss_dims)
        self.loss_offsets2d = build_loss(loss_offsets2d)
        self.loss_direct_depth = build_loss(loss_direct_depth)
        self.loss_keypoints_depth = build_loss(loss_keypoints_depth)
        self.loss_combined_depth = build_loss(loss_combined_depth)
        self.bbox_coder = build_bbox_coder(bbox_coder)

    def _init_edge_module(self):
        """Initialize edge fusion module for feature extraction."""
        self.edge_fuse_cls = EdgeFusionModule(self.num_classes, 256)
        for i in range(len(self.edge_fusion_inds)):
            reg_inds, out_inds = self.edge_fusion_inds[i]
            out_channels = self.group_reg_dims[reg_inds][out_inds]
            fusion_layer = EdgeFusionModule(out_channels, 256)
            layer_name = f'edge_fuse_reg_{reg_inds}_{out_inds}'
            self.add_module(layer_name, fusion_layer)
```
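With this head's default `group_reg_dims` of `((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, ))`, the `edge_fusion_inds` entry `(1, 0)` used by the unit test below resolves to group 1, output 0, i.e. the 2-channel 2D-offsets branch from the diagram above. A quick sketch of the lookup `_init_edge_module` performs:

```python
group_reg_dims = ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, ))
edge_fusion_inds = [(1, 0)]  # value used in test_monoflex_head below

for reg_inds, out_inds in edge_fusion_inds:
    out_channels = group_reg_dims[reg_inds][out_inds]
    print(f'edge_fuse_reg_{reg_inds}_{out_inds} -> {out_channels} channels')
    # prints: edge_fuse_reg_1_0 -> 2 channels (the 2D offsets output)
```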
mmdet3d/models/dense_heads/monoflex_head.py (continued)

```python
    def init_weights(self):
        """Initialize weights."""
        super().init_weights()
        self.conv_cls.bias.data.fill_(self.init_bias)
        xavier_init(self.conv_regs[4][0], gain=0.01)
        xavier_init(self.conv_regs[7][0], gain=0.01)
        for m in self.conv_regs.modules():
            if isinstance(m, nn.Conv2d):
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _init_predictor(self):
        """Initialize predictor layers of the head."""
        self.conv_cls_prev = self._init_branch(
            conv_channels=self.cls_branch,
            conv_strides=(1, ) * len(self.cls_branch))
        self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels,
                                  1)
        # init regression head
        self.conv_reg_prevs = nn.ModuleList()
        # init output head
        self.conv_regs = nn.ModuleList()
        # group_reg_dims:
        # ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, ))
        for i in range(len(self.group_reg_dims)):
            reg_dims = self.group_reg_dims[i]
            reg_branch_channels = self.reg_branch[i]
            out_channel = self.out_channels[i]
            reg_list = nn.ModuleList()
            if len(reg_branch_channels) > 0:
                self.conv_reg_prevs.append(
                    self._init_branch(
                        conv_channels=reg_branch_channels,
                        conv_strides=(1, ) * len(reg_branch_channels)))
                for reg_dim in reg_dims:
                    reg_list.append(nn.Conv2d(out_channel, reg_dim, 1))
                self.conv_regs.append(reg_list)
            else:
                self.conv_reg_prevs.append(None)
                for reg_dim in reg_dims:
                    reg_list.append(nn.Conv2d(self.feat_channels, reg_dim, 1))
                self.conv_regs.append(reg_list)
```
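Each tuple in `group_reg_dims` receives its own 3 x 3 branch, and every entry inside a tuple gets a separate 1 x 1 output conv, so the channel count of the concatenated `bbox_pred` is the sum over all entries:

```python
group_reg_dims = ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, ))
total_channels = sum(sum(dims) for dims in group_reg_dims)
print(total_channels)  # 50 -> matches torch.Size([2, 50, 32, 32]) in the test
```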
mmdet3d/models/dense_heads/monoflex_head.py (continued)

```python
    def _init_layers(self):
        """Initialize layers of the head."""
        self._init_predictor()
        if self.use_edge_fusion:
            self._init_edge_module()

    def forward_train(self, x, input_metas, gt_bboxes, gt_labels,
                      gt_bboxes_3d, gt_labels_3d, centers2d, depths,
                      attr_labels, gt_bboxes_ignore, proposal_cfg, **kwargs):
        """
        Args:
            x (list[Tensor]): Features from FPN.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes (list[Tensor]): Ground truth bboxes of the image,
                shape (num_gts, 4).
            gt_labels (list[Tensor]): Ground truth labels of each box,
                shape (num_gts,).
            gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image,
                shape (num_gts, self.bbox_code_size).
            gt_labels_3d (list[Tensor]): 3D ground truth labels of each box,
                shape (num_gts,).
            centers2d (list[Tensor]): Projected 3D center of each box,
                shape (num_gts, 2).
            depths (list[Tensor]): Depth of projected 3D center of each box,
                shape (num_gts,).
            attr_labels (list[Tensor]): Attribute labels of each box,
                shape (num_gts,).
            gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be
                ignored, shape (num_ignored_gts, 4).
            proposal_cfg (mmcv.Config): Test / postprocessing configuration;
                if None, test_cfg is used.

        Returns:
            tuple:
                losses (dict[str, Tensor]): A dictionary of loss components.
                proposal_list (list[Tensor]): Proposals of each image.
        """
        outs = self(x, input_metas)
        if gt_labels is None:
            loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths,
                                  attr_labels, input_metas)
        else:
            loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d,
                                  gt_labels_3d, centers2d, depths,
                                  attr_labels, input_metas)
        losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        if proposal_cfg is None:
            return losses
        else:
            proposal_list = self.get_bboxes(
                *outs, input_metas, cfg=proposal_cfg)
            return losses, proposal_list

    def forward(self, feats, input_metas):
        """Forward features from the upstream network.

        Args:
            feats (list[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.

        Returns:
            tuple:
                cls_scores (list[Tensor]): Box scores for each scale level,
                    each is a 4D-tensor, the channel number is
                    num_points * num_classes.
                bbox_preds (list[Tensor]): Box energies / deltas for each
                    scale level, each is a 4D-tensor, the channel number is
                    num_points * bbox_code_size.
        """
        mlvl_input_metas = [input_metas for i in range(len(feats))]
        return multi_apply(self.forward_single, feats, mlvl_input_metas)

    def forward_single(self, x, input_metas):
        """Forward features of a single scale level.

        Args:
            x (Tensor): Feature maps from a specific FPN feature level.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.

        Returns:
            tuple: Scores for each class, bbox predictions.
        """
        img_h, img_w = input_metas[0]['pad_shape'][:2]
        batch_size, _, feat_h, feat_w = x.shape
        downsample_ratio = img_h / feat_h

        for conv_cls_prev_layer in self.conv_cls_prev:
            cls_feat = conv_cls_prev_layer(x)
        out_cls = self.conv_cls(cls_feat)

        if self.use_edge_fusion:
            # calculate the edge indices for the batch data
            edge_indices_list = get_edge_indices(
                input_metas, downsample_ratio, device=x.device)
            edge_lens = [
                edge_indices.shape[0] for edge_indices in edge_indices_list
            ]
            max_edge_len = max(edge_lens)
            edge_indices = x.new_zeros((batch_size, max_edge_len, 2),
                                       dtype=torch.long)
            for i in range(batch_size):
                edge_indices[i, :edge_lens[i]] = edge_indices_list[i]
            # cls feature map edge fusion
            out_cls = self.edge_fuse_cls(cls_feat, out_cls, edge_indices,
                                         edge_lens, feat_h, feat_w)

        bbox_pred = []

        for i in range(len(self.group_reg_dims)):
            reg_feat = x.clone()
            # feature regression head
            if len(self.reg_branch[i]) > 0:
                for conv_reg_prev_layer in self.conv_reg_prevs[i]:
                    reg_feat = conv_reg_prev_layer(reg_feat)

            for j, conv_reg in enumerate(self.conv_regs[i]):
                out_reg = conv_reg(reg_feat)
                # Use Edge Fusion Module
                if self.use_edge_fusion and (i, j) in self.edge_fusion_inds:
                    # reg feature map edge fusion
                    out_reg = getattr(self, 'edge_fuse_reg_{}_{}'.format(
                        i, j))(reg_feat, out_reg, edge_indices, edge_lens,
                               feat_h, feat_w)
                bbox_pred.append(out_reg)

        bbox_pred = torch.cat(bbox_pred, dim=1)
        cls_score = out_cls.sigmoid()  # turn to 0-1
        cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4)

        return cls_score, bbox_pred

    def get_bboxes(self, cls_scores, bbox_preds, input_metas):
        """Generate bboxes from bbox head predictions.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level.
            bbox_preds (list[Tensor]): Box regression for each scale.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.

        Returns:
            list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]:
                Each item in result_list is a 4-tuple.
        """
        assert len(cls_scores) == len(bbox_preds) == 1
        cam2imgs = torch.stack([
            cls_scores[0].new_tensor(input_meta['cam2img'])
            for input_meta in input_metas
        ])
        batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap(
            cls_scores[0],
            bbox_preds[0],
            input_metas,
            cam2imgs=cam2imgs,
            topk=100,
            kernel=3)

        result_list = []
        for img_id in range(len(input_metas)):
            bboxes = batch_bboxes[img_id]
            scores = batch_scores[img_id]
            labels = batch_topk_labels[img_id]

            keep_idx = scores > 0.25
            bboxes = bboxes[keep_idx]
            scores = scores[keep_idx]
            labels = labels[keep_idx]

            bboxes = input_metas[img_id]['box_type_3d'](
                bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5))
            attrs = None
            result_list.append((bboxes, scores, labels, attrs))

        return result_list

    def decode_heatmap(self,
                       cls_score,
                       reg_pred,
                       input_metas,
                       cam2imgs,
                       topk=100,
                       kernel=3):
        """Transform outputs into detections raw bbox predictions.

        Args:
            cls_score (Tensor): Center predict heatmap,
                shape (B, num_classes, H, W).
            reg_pred (Tensor): Box regression map,
                shape (B, channel, H, W).
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            cam2imgs (Tensor): Camera intrinsic matrix,
                shape (N, 4, 4).
            topk (int, optional): Get top k center keypoints from heatmap.
                Default 100.
            kernel (int, optional): Max pooling kernel for extract local
                maximum pixels. Default 3.

        Returns:
            tuple[torch.Tensor]: Decoded output of MonoFlexHead, containing
                the following Tensors:
              - batch_bboxes (Tensor): Coords of each 3D box.
                    shape (B, k, 7)
              - batch_scores (Tensor): Scores of each 3D box.
                    shape (B, k)
              - batch_topk_labels (Tensor): Categories of each 3D box.
                    shape (B, k)
        """
        img_h, img_w = input_metas[0]['pad_shape'][:2]
        batch_size, _, feat_h, feat_w = cls_score.shape
        downsample_ratio = img_h / feat_h

        center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel)

        *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap(
            center_heatmap_pred, k=topk)
        batch_scores, batch_index, batch_topk_labels = batch_dets

        regression = transpose_and_gather_feat(reg_pred, batch_index)
        regression = regression.view(-1, 8)

        pred_base_centers2d = torch.cat(
            [topk_xs.view(-1, 1),
             topk_ys.view(-1, 1).float()], dim=1)
        preds = self.bbox_coder.decode(regression, batch_topk_labels,
                                       downsample_ratio, cam2imgs)
        pred_locations = self.bbox_coder.decode_location(
            pred_base_centers2d, preds['offsets2d'], preds['combined_depth'],
            cam2imgs, downsample_ratio)
        pred_yaws = self.bbox_coder.decode_orientation(
            preds['orientations']).unsqueeze(-1)
        pred_dims = preds['dimensions']
        batch_bboxes = torch.cat((pred_locations, pred_dims, pred_yaws),
                                 dim=1)
        batch_bboxes = batch_bboxes.view(batch_size, -1, self.bbox_code_size)
        return batch_bboxes, batch_scores, batch_topk_labels

    def get_predictions(self, pred_reg, labels3d, centers2d, reg_mask,
                        batch_indices, input_metas, downsample_ratio):
        """Prepare predictions for computing loss.

        Args:
            pred_reg (Tensor): Box regression map,
                shape (B, channel, H, W).
            labels3d (Tensor): Labels of each 3D box.
                shape (B * max_objs, )
            centers2d (Tensor): Coords of each projected 3D box
                center on image. shape (N, 2)
            reg_mask (Tensor): Indexes of the existence of the 3D box.
                shape (B * max_objs, )
            batch_indices (Tensor): Batch indices of the 3D box.
                shape (N, 3)
            input_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            downsample_ratio (int): The stride of feature map.

        Returns:
            dict: The predictions for computing loss.
        """
        batch, channel = pred_reg.shape[0], pred_reg.shape[1]
        w = pred_reg.shape[3]
        cam2imgs = torch.stack([
            centers2d.new_tensor(input_meta['cam2img'])
            for input_meta in input_metas
        ])
        # (batch_size, 4, 4) -> (N, 4, 4)
        cam2imgs = cam2imgs[batch_indices, :, :]
        centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0]
        centers2d_inds = centers2d_inds.view(batch, -1)
        pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds)
        pred_regression_pois = pred_regression.view(-1, channel)[reg_mask]
        preds = self.bbox_coder.decode(pred_regression_pois, labels3d,
                                       downsample_ratio, cam2imgs)

        return preds
```
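`transpose_and_gather_feat` expects flattened spatial indices, so `get_predictions` first converts integer `(x, y)` centers into row-major offsets over a feature map of width `w`. The conversion in isolation:

```python
import torch

w = 32  # feature-map width
centers2d = torch.tensor([[3, 5], [10, 0]])  # integer (x, y) coordinates
inds = centers2d[:, 1] * w + centers2d[:, 0]  # row-major: y * w + x
print(inds)  # tensor([163,  10])
```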
mmdet3d/models/dense_heads/monoflex_head.py (continued)

```python
    def get_targets(self, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list,
                    gt_labels_3d_list, centers2d_list, depths_list,
                    feat_shape, img_shape, input_metas):
        """Get training targets for batch images.

        Args:
            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each
                image, shape (num_gt, 4).
            gt_labels_list (list[Tensor]): Ground truth labels of each
                box, shape (num_gt,).
            gt_bboxes_3d_list (list[:obj:`CameraInstance3DBoxes`]): 3D
                ground truth bboxes of each image,
                shape (num_gt, bbox_code_size).
            gt_labels_3d_list (list[Tensor]): 3D ground truth labels of
                each box, shape (num_gt,).
            centers2d_list (list[Tensor]): Projected 3D centers onto 2D
                image, shape (num_gt, 2).
            depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
                image, each has shape (num_gt, 1).
            feat_shape (tuple[int]): Feature map shape with value,
                shape (B, _, H, W).
            img_shape (tuple[int]): Image shape in [h, w] format.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.

        Returns:
            tuple[Tensor, dict]: The Tensor value is the targets of
                center heatmap, the dict has components below:
              - base_centers2d_target (Tensor): Coords of each projected
                    3D box center on image. shape (B * max_objs, 2),
                    [dtype: int]
              - labels3d (Tensor): Labels of each 3D box.
                    shape (N, )
              - reg_mask (Tensor): Mask of the existence of the 3D box.
                    shape (B * max_objs, )
              - batch_indices (Tensor): Batch id of the 3D box.
                    shape (N, )
              - depth_target (Tensor): Depth target of each 3D box.
                    shape (N, )
              - keypoints2d_target (Tensor): Keypoints of each projected
                    3D box on image. shape (N, 10, 2)
              - keypoints_mask (Tensor): Keypoints mask of each projected
                    3D box on image. shape (N, 10)
              - keypoints_depth_mask (Tensor): Depths decoded from keypoints
                    of each 3D box. shape (N, 3)
              - orientations_target (Tensor): Orientation (encoded local yaw)
                    target of each 3D box. shape (N, )
              - offsets2d_target (Tensor): Offsets target of each projected
                    3D box. shape (N, 2)
              - dimensions_target (Tensor): Dimensions target of each 3D box.
                    shape (N, 3)
              - downsample_ratio (int): The stride of feature map.
        """
        img_h, img_w = img_shape[:2]
        batch_size, _, feat_h, feat_w = feat_shape

        width_ratio = float(feat_w / img_w)  # 1/4
        height_ratio = float(feat_h / img_h)  # 1/4

        assert width_ratio == height_ratio

        # Whether to filter the objects which are not in FOV.
        if self.filter_outside_objs:
            filter_outside_objs(gt_bboxes_list, gt_labels_list,
                                gt_bboxes_3d_list, gt_labels_3d_list,
                                centers2d_list, input_metas)

        # transform centers2d to base centers2d for regression and
        # heatmap generation.
        # centers2d = int(base_centers2d) + offsets2d
        base_centers2d_list, offsets2d_list, trunc_mask_list = \
            handle_proj_objs(centers2d_list, gt_bboxes_list, input_metas)

        keypoints2d_list, keypoints_mask_list, keypoints_depth_mask_list = \
            get_keypoints(gt_bboxes_3d_list, centers2d_list, input_metas)

        center_heatmap_target = gt_bboxes_list[-1].new_zeros(
            [batch_size, self.num_classes, feat_h, feat_w])

        for batch_id in range(batch_size):
            # project gt_bboxes from input image to feat map
            gt_bboxes = gt_bboxes_list[batch_id] * width_ratio
            gt_labels = gt_labels_list[batch_id]

            # project base centers2d from input image to feat map
            gt_base_centers2d = base_centers2d_list[batch_id] * width_ratio
            trunc_masks = trunc_mask_list[batch_id]

            for j, base_center2d in enumerate(gt_base_centers2d):
                if trunc_masks[j]:
                    # for outside objects, generate ellipse heatmap
                    base_center2d_x_int, base_center2d_y_int = \
                        base_center2d.int()
                    scale_box_w = min(base_center2d_x_int - gt_bboxes[j][0],
                                      gt_bboxes[j][2] - base_center2d_x_int)
                    scale_box_h = min(base_center2d_y_int - gt_bboxes[j][1],
                                      gt_bboxes[j][3] - base_center2d_y_int)
                    radius_x = scale_box_w * self.edge_heatmap_ratio
                    radius_y = scale_box_h * self.edge_heatmap_ratio
                    radius_x, radius_y = max(0, int(radius_x)), max(
                        0, int(radius_y))
                    assert min(radius_x, radius_y) == 0
                    ind = gt_labels[j]
                    get_ellip_gaussian_2D(
                        center_heatmap_target[batch_id, ind],
                        [base_center2d_x_int, base_center2d_y_int], radius_x,
                        radius_y)
                else:
                    base_center2d_x_int, base_center2d_y_int = \
                        base_center2d.int()
                    scale_box_h = (gt_bboxes[j][3] - gt_bboxes[j][1])
                    scale_box_w = (gt_bboxes[j][2] - gt_bboxes[j][0])
                    radius = gaussian_radius([scale_box_h, scale_box_w],
                                             min_overlap=0.7)
                    radius = max(0, int(radius))
                    ind = gt_labels[j]
                    gen_gaussian_target(
                        center_heatmap_target[batch_id, ind],
                        [base_center2d_x_int, base_center2d_y_int], radius)

        avg_factor = max(1, center_heatmap_target.eq(1).sum())
        num_ctrs = [centers2d.shape[0] for centers2d in centers2d_list]
        max_objs = max(num_ctrs)
        batch_indices = [
            centers2d_list[0].new_full((num_ctrs[i], ), i)
            for i in range(batch_size)
        ]
        batch_indices = torch.cat(batch_indices, dim=0)
        reg_mask = torch.zeros(
            (batch_size, max_objs),
            dtype=torch.bool).to(base_centers2d_list[0].device)
        gt_bboxes_3d = input_metas[0]['box_type_3d'].cat(gt_bboxes_3d_list)
        gt_bboxes_3d = gt_bboxes_3d.to(base_centers2d_list[0].device)

        # encode original local yaw to multibin format
        orientations_target = self.bbox_coder.encode(gt_bboxes_3d)

        batch_base_centers2d = base_centers2d_list[0].new_zeros(
            (batch_size, max_objs, 2))

        for i in range(batch_size):
            reg_mask[i, :num_ctrs[i]] = 1
            batch_base_centers2d[i, :num_ctrs[i]] = base_centers2d_list[i]

        flatten_reg_mask = reg_mask.flatten()

        # transform base centers2d from input scale to output scale
        batch_base_centers2d = batch_base_centers2d.view(-1, 2) * width_ratio

        dimensions_target = gt_bboxes_3d.tensor[:, 3:6]
        labels_3d = torch.cat(gt_labels_3d_list)
        keypoints2d_target = torch.cat(keypoints2d_list)
        keypoints_mask = torch.cat(keypoints_mask_list)
        keypoints_depth_mask = torch.cat(keypoints_depth_mask_list)
        offsets2d_target = torch.cat(offsets2d_list)
        bboxes2d = torch.cat(gt_bboxes_list)

        # transform FCOS style bbox into [x1, y1, x2, y2] format.
        bboxes2d_target = torch.cat([bboxes2d[:, 0:2] * -1, bboxes2d[:, 2:]],
                                    dim=-1)
        depths = torch.cat(depths_list)

        target_labels = dict(
            base_centers2d_target=batch_base_centers2d.int(),
            labels3d=labels_3d,
            reg_mask=flatten_reg_mask,
            batch_indices=batch_indices,
            bboxes2d_target=bboxes2d_target,
            depth_target=depths,
            keypoints2d_target=keypoints2d_target,
            keypoints_mask=keypoints_mask,
            keypoints_depth_mask=keypoints_depth_mask,
            orientations_target=orientations_target,
            offsets2d_target=offsets2d_target,
            dimensions_target=dimensions_target,
            downsample_ratio=1 / width_ratio)

        return center_heatmap_target, avg_factor, target_labels

    def loss(self,
             cls_scores,
             bbox_preds,
             gt_bboxes,
             gt_labels,
             gt_bboxes_3d,
             gt_labels_3d,
             centers2d,
             depths,
             attr_labels,
             input_metas,
             gt_bboxes_ignore=None):
        """Compute loss of the head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level.
                shape (num_gt, 4).
            bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel
                number is bbox_code_size.
                shape (B, 7, H, W).
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image.
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): Class indices corresponding to each box.
                shape (num_gts, ).
            gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D boxes ground
                truth. It is the flipped gt_bboxes.
            gt_labels_3d (list[Tensor]): Same as gt_labels.
            centers2d (list[Tensor]): 2D centers on the image.
                shape (num_gts, 2).
            depths (list[Tensor]): Depth ground truth.
                shape (num_gts, ).
            attr_labels (list[Tensor]): Attributes indices of each box.
                In KITTI it is None.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
                boxes can be ignored when computing the loss.
                Default: None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert len(cls_scores) == len(bbox_preds) == 1
        assert attr_labels is None
        assert gt_bboxes_ignore is None
        center2d_heatmap = cls_scores[0]
        pred_reg = bbox_preds[0]

        center2d_heatmap_target, avg_factor, target_labels = \
            self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d,
                             gt_labels_3d, centers2d, depths,
                             center2d_heatmap.shape,
                             input_metas[0]['pad_shape'],
                             input_metas)

        preds = self.get_predictions(
            pred_reg=pred_reg,
            labels3d=target_labels['labels3d'],
            centers2d=target_labels['base_centers2d_target'],
            reg_mask=target_labels['reg_mask'],
            batch_indices=target_labels['batch_indices'],
            input_metas=input_metas,
            downsample_ratio=target_labels['downsample_ratio'])

        # heatmap loss
        loss_cls = self.loss_cls(
            center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor)

        # bbox2d regression loss
        loss_bbox = self.loss_bbox(preds['bboxes2d'],
                                   target_labels['bboxes2d_target'])

        # keypoints loss, the keypoints in predictions and target are all
        # local coordinates. The mask dtype should be bool, not int or
        # float, to ensure boolean indexing.
        keypoints2d_mask = target_labels['keypoints_mask']
        loss_keypoints = self.loss_keypoints(
            preds['keypoints2d'][keypoints2d_mask],
            target_labels['keypoints2d_target'][keypoints2d_mask])

        # orientations loss
        loss_dir = self.loss_dir(preds['orientations'],
                                 target_labels['orientations_target'])

        # dimensions loss
        loss_dims = self.loss_dims(preds['dimensions'],
                                   target_labels['dimensions_target'])

        # offsets for center heatmap
        loss_offsets2d = self.loss_offsets2d(
            preds['offsets2d'], target_labels['offsets2d_target'])

        # directly regressed depth loss with direct depth uncertainty loss
        direct_depth_weights = torch.exp(-preds['direct_depth_uncertainty'])
        loss_weight_1 = self.loss_direct_depth.loss_weight
        loss_direct_depth = self.loss_direct_depth(
            preds['direct_depth'], target_labels['depth_target'],
            direct_depth_weights)
        loss_uncertainty_1 = \
            preds['direct_depth_uncertainty'] * loss_weight_1
        loss_direct_depth = loss_direct_depth + loss_uncertainty_1.mean()

        # keypoints decoded depth loss with keypoints depth uncertainty loss
        depth_mask = target_labels['keypoints_depth_mask']
        depth_target = target_labels['depth_target'].unsqueeze(-1).repeat(
            1, 3)
        valid_keypoints_depth_uncertainty = preds[
            'keypoints_depth_uncertainty'][depth_mask]
        valid_keypoints_depth_weights = torch.exp(
            -valid_keypoints_depth_uncertainty)
        loss_keypoints_depth = self.loss_keypoints_depth(
            preds['keypoints_depth'][depth_mask], depth_target[depth_mask],
            valid_keypoints_depth_weights)
        loss_weight_2 = self.loss_keypoints_depth.loss_weight
        loss_uncertainty_2 = \
            valid_keypoints_depth_uncertainty * loss_weight_2
        loss_keypoints_depth = loss_keypoints_depth + loss_uncertainty_2.mean()

        # combined depth loss for optimizing the uncertainty
        loss_combined_depth = self.loss_combined_depth(
            preds['combined_depth'], target_labels['depth_target'])

        loss_dict = dict(
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            loss_keypoints=loss_keypoints,
            loss_dir=loss_dir,
            loss_dims=loss_dims,
            loss_offsets2d=loss_offsets2d,
            loss_direct_depth=loss_direct_depth,
            loss_keypoints_depth=loss_keypoints_depth,
            loss_combined_depth=loss_combined_depth)

        return loss_dict
```
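Both depth terms follow the aleatoric-uncertainty pattern: the regression loss is attenuated by `exp(-u)` while `u` itself is penalized, so the network cannot silence hard samples for free. A minimal standalone sketch of that trade-off with a plain L1 loss (a hypothetical helper, not the head's API):

```python
import torch


def uncertainty_weighted_l1(pred, target, log_uncertainty, loss_weight=0.1):
    """Sketch: L1 attenuated by exp(-u), plus a penalty on u itself."""
    weights = torch.exp(-log_uncertainty)   # confident -> weight ~1
    weighted_l1 = torch.abs(pred - target) * weights
    return (weighted_l1 + log_uncertainty * loss_weight).mean()


pred = torch.tensor([10.0, 40.0])
target = torch.tensor([11.0, 12.0])
log_u = torch.tensor([0.0, 3.0])  # the badly wrong sample claims uncertainty
print(uncertainty_weighted_l1(pred, target, log_u))
# exp(-u) mutes the outlier, but the +u term keeps "always uncertain"
# from being a free way to zero the loss.
```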
mmdet3d/models/model_utils/__init__.py

```diff
 # Copyright (c) OpenMMLab. All rights reserved.
+from .edge_fusion_module import EdgeFusionModule
 from .transformer import GroupFree3DMHA
 from .vote_module import VoteModule

-__all__ = ['VoteModule', 'GroupFree3DMHA']
+__all__ = ['VoteModule', 'GroupFree3DMHA', 'EdgeFusionModule']
```
mmdet3d/models/model_utils/edge_fusion_module.py (new file, 0 → 100644)

```python
from mmcv.cnn import ConvModule
from mmcv.runner import BaseModule
from torch import nn as nn
from torch.nn import functional as F


class EdgeFusionModule(BaseModule):
    """Edge Fusion Module for feature map.

    Args:
        out_channels (int): The number of output channels.
        feat_channels (int): The number of channels in feature map
            during edge feature fusion.
        kernel_size (int, optional): Kernel size of convolution.
            Default: 3.
        act_cfg (dict, optional): Config of activation.
            Default: dict(type='ReLU').
        norm_cfg (dict, optional): Config of normalization.
            Default: dict(type='BN1d').
    """

    def __init__(self,
                 out_channels,
                 feat_channels,
                 kernel_size=3,
                 act_cfg=dict(type='ReLU'),
                 norm_cfg=dict(type='BN1d')):
        super().__init__()
        self.edge_convs = nn.Sequential(
            ConvModule(
                feat_channels,
                feat_channels,
                kernel_size=kernel_size,
                padding=kernel_size // 2,
                conv_cfg=dict(type='Conv1d'),
                norm_cfg=norm_cfg,
                act_cfg=act_cfg),
            nn.Conv1d(feat_channels, out_channels, kernel_size=1))
        self.feat_channels = feat_channels

    def forward(self, features, fused_features, edge_indices, edge_lens,
                output_h, output_w):
        """Forward pass.

        Args:
            features (torch.Tensor): Different representative features
                for fusion.
            fused_features (torch.Tensor): Different representative
                features to be fused.
            edge_indices (torch.Tensor): Batch image edge indices.
            edge_lens (list[int]): List of edge length of each image.
            output_h (int): Height of output feature map.
            output_w (int): Width of output feature map.

        Returns:
            torch.Tensor: Fused feature maps.
        """
        batch_size = features.shape[0]
        # normalize
        grid_edge_indices = edge_indices.view(batch_size, -1, 1, 2).float()
        grid_edge_indices[..., 0] = \
            grid_edge_indices[..., 0] / (output_w - 1) * 2 - 1
        grid_edge_indices[..., 1] = \
            grid_edge_indices[..., 1] / (output_h - 1) * 2 - 1

        # apply edge fusion
        edge_features = F.grid_sample(
            features, grid_edge_indices, align_corners=True).squeeze(-1)
        edge_output = self.edge_convs(edge_features)

        for k in range(batch_size):
            edge_indice_k = edge_indices[k, :edge_lens[k]]
            fused_features[k, :, edge_indice_k[:, 1],
                           edge_indice_k[:, 0]] += edge_output[
                               k, :, :edge_lens[k]]

        return fused_features
```
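A toy forward pass through the module (arbitrary shapes; `edge_indices` holds `(x, y)` pixel coordinates, zero-padded per batch and truncated again via `edge_lens`):

```python
import torch
from mmdet3d.models.model_utils import EdgeFusionModule

# Toy run: fuse a 3-channel output map along 4 boundary pixels.
module = EdgeFusionModule(out_channels=3, feat_channels=8).eval()
features = torch.rand(1, 8, 16, 16)
fused_features = torch.rand(1, 3, 16, 16)
edge_indices = torch.tensor([[[0, 0], [1, 0], [2, 0], [3, 0]]])  # (B, K, 2)
out = module(features, fused_features, edge_indices, edge_lens=[4],
             output_h=16, output_w=16)
assert out.shape == (1, 3, 16, 16)  # edge pixels got the conv output added
```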
mmdet3d/models/utils/edge_indices.py

```diff
@@ -4,6 +4,7 @@ import torch
 def get_edge_indices(img_metas,
+                     downsample_ratio,
                      step=1,
                      pad_mode='default',
                      dtype=np.float32,
@@ -17,6 +18,7 @@ def get_edge_indices(img_metas,
     Args:
         img_metas (list[dict]): Meta information of each image, e.g.,
             image size, scaling factor, etc.
+        downsample_ratio (int): Downsample ratio of output feature.
         step (int, optional): Step size used for generating
             edge indices. Default: 1.
         pad_mode (str, optional): Padding mode during data pipeline.
@@ -32,13 +34,21 @@ def get_edge_indices(img_metas,
     edge_indices_list = []
     for i in range(len(img_metas)):
         img_shape = img_metas[i]['img_shape']
+        pad_shape = img_metas[i]['pad_shape']
         h, w = img_shape[:2]
+        pad_h, pad_w = pad_shape
         edge_indices = []

         if pad_mode == 'default':
             x_min = 0
             y_min = 0
-            x_max, y_max = w - 1, h - 1
+            x_max = (w - 1) // downsample_ratio
+            y_max = (h - 1) // downsample_ratio
+        elif pad_mode == 'center':
+            x_min = np.ceil((pad_w - w) / 2 * downsample_ratio)
+            y_min = np.ceil((pad_h - h) / 2 * downsample_ratio)
+            x_max = x_min + w // downsample_ratio
+            y_max = y_min + h // downsample_ratio
         else:
             raise NotImplementedError
```
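The updated assertions in tests/test_utils/test_utils.py (near the end of this diff) are consistent with a boundary walk over the downsampled feature map: in 'default' pad mode the edge spans `(0, 0)` to `((w - 1) // ratio, (h - 1) // ratio)`, and a step-1 perimeter walk visits `2 * (x_max + y_max)` points. A hedged back-of-envelope check (the walk itself is elided from this hunk, so this is an inference from the asserted counts):

```python
def expected_edge_count(h, w, downsample_ratio=4):
    # Hedged reading of 'default' pad mode: perimeter walk over the
    # downsampled map, corners shared between adjacent sides.
    x_max = (w - 1) // downsample_ratio
    y_max = (h - 1) // downsample_ratio
    return 2 * (x_max + y_max)


print(expected_edge_count(110, 110))  # 108, matching the updated test
print(expected_edge_count(98, 110))   # 102
```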
tests/test_models/test_heads/test_heads.py

@@ -1505,3 +1505,62 @@ def test_pgd_head():

All new lines except the trailing context of `test_pgd_head`:

```python
    assert results[0][2].shape == torch.Size([20])
    assert results[0][3] is None
    assert results[0][4].shape == torch.Size([20, 5])


def test_monoflex_head():
    head_cfg = dict(
        type='MonoFlexHead',
        num_classes=3,
        in_channels=64,
        use_edge_fusion=True,
        edge_fusion_inds=[(1, 0)],
        edge_heatmap_ratio=1 / 8,
        stacked_convs=0,
        feat_channels=64,
        use_direction_classifier=False,
        diff_rad_by_sin=False,
        pred_attrs=False,
        pred_velo=False,
        dir_offset=0,
        strides=None,
        group_reg_dims=((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ),
                        (1, )),
        cls_branch=(256, ),
        reg_branch=((256, ), (256, ), (256, ), (256, ), (256, ), (256, ),
                    (256, ), (256, )),
        num_attrs=0,
        bbox_code_size=7,
        dir_branch=(),
        attr_branch=(),
        bbox_coder=dict(
            type='MonoFlexCoder',
            depth_mode='exp',
            base_depth=(26.494627, 16.05988),
            depth_range=[0.1, 100],
            combine_depth=True,
            uncertainty_range=[-10, 10],
            base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367, 0.1022),
                       (0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427),
                       (1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)),
            dims_mode='linear',
            multibin=True,
            num_dir_bins=4,
            bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2],
            bin_margin=np.pi / 6,
            code_size=7),
        conv_bias=True,
        dcn_on_last_conv=False)
    self = build_head(head_cfg)

    feats = [torch.rand([2, 64, 32, 32], dtype=torch.float32)]

    input_metas = [
        dict(img_shape=(110, 110), pad_shape=(128, 128)),
        dict(img_shape=(98, 110), pad_shape=(128, 128))
    ]
    cls_score, out_reg = self(feats, input_metas)

    assert cls_score[0].shape == torch.Size([2, 3, 32, 32])
    assert out_reg[0].shape == torch.Size([2, 50, 32, 32])
```
tests/test_utils/test_bbox_coders.py

```diff
 # Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
 import torch
 from mmcv.cnn import Scale
 from torch import nn as nn
```

@@ -596,3 +597,69 @@ def test_smoke_bbox_coder():

All new lines except the trailing context of `test_smoke_bbox_coder`:

```python
    locations = torch.tensor([[15., 2., 1.], [15., 2., -1.]])
    orientations = bbox_coder._decode_orientation(ori_vector, locations)
    assert orientations.shape == torch.Size([2, 1])


def test_monoflex_bbox_coder():
    bbox_coder_cfg = dict(
        type='MonoFlexCoder',
        depth_mode='exp',
        base_depth=(26.494627, 16.05988),
        depth_range=[0.1, 100],
        combine_depth=True,
        uncertainty_range=[-10, 10],
        base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367, 0.1022),
                   (0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427),
                   (1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)),
        dims_mode='linear',
        multibin=True,
        num_dir_bins=4,
        bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2],
        bin_margin=np.pi / 6,
        code_size=7)
    bbox_coder = build_bbox_coder(bbox_coder_cfg)
    gt_bboxes_3d = CameraInstance3DBoxes(torch.rand([6, 7]))
    orientation_target = bbox_coder.encode(gt_bboxes_3d)
    assert orientation_target.shape == torch.Size([6, 8])

    regression = torch.rand([100, 50])
    base_centers2d = torch.rand([100, 2])
    labels = torch.ones([100])
    downsample_ratio = 4
    cam2imgs = torch.rand([100, 4, 4])

    preds = bbox_coder.decode(regression, base_centers2d, labels,
                              downsample_ratio, cam2imgs)

    assert preds['bboxes2d'].shape == torch.Size([100, 4])
    assert preds['dimensions'].shape == torch.Size([100, 3])
    assert preds['offsets2d'].shape == torch.Size([100, 2])
    assert preds['keypoints2d'].shape == torch.Size([100, 10, 2])
    assert preds['orientations'].shape == torch.Size([100, 16])
    assert preds['direct_depth'].shape == torch.Size([100, ])
    assert preds['keypoints_depth'].shape == torch.Size([100, 3])
    assert preds['combined_depth'].shape == torch.Size([100, ])
    assert preds['direct_depth_uncertainty'].shape == torch.Size([100, ])
    assert preds['keypoints_depth_uncertainty'].shape == torch.Size([100, 3])

    offsets_2d = torch.randn([100, 2])
    depths = torch.randn([100, ])
    locations = bbox_coder.decode_location(base_centers2d, offsets_2d, depths,
                                           cam2imgs, downsample_ratio)
    assert locations.shape == torch.Size([100, 3])

    orientations = torch.randn([100, 16])
    yaws, local_yaws = bbox_coder.decode_orientation(orientations, locations)
    assert yaws.shape == torch.Size([100, ])
    assert local_yaws.shape == torch.Size([100, ])
```
tests/test_utils/test_utils.py

```diff
@@ -195,11 +195,15 @@ def test_points_img2cam():
 def test_generate_edge_indices():
-    img_metas = [dict(img_shape=[300, 400]), dict(img_shape=[500, 450])]
-    edge_indices_list = get_edge_indices(img_metas)
+    input_metas = [
+        dict(img_shape=(110, 110), pad_shape=(128, 128)),
+        dict(img_shape=(98, 110), pad_shape=(128, 128))
+    ]
+    downsample_ratio = 4
+    edge_indices_list = get_edge_indices(input_metas, downsample_ratio)

-    assert edge_indices_list[0].shape[0] == 1396
-    assert edge_indices_list[1].shape[0] == 1896
+    assert edge_indices_list[0].shape[0] == 108
+    assert edge_indices_list[1].shape[0] == 102


 def test_truncation_hanlde():
```