OpenDAS / mmdetection3d — Commit 8538177b (unverified)
Authored Jan 21, 2022 by ChaimZhu; committed by GitHub on Jan 21, 2022
[Feature] Add MonoFlex Head (#1044)
Parent: 4590418e

Showing 12 changed files with 1093 additions and 26 deletions:
  mmdet3d/core/bbox/coders/monoflex_bbox_coder.py     +13   -13
  mmdet3d/core/bbox/structures/utils.py               +7    -4
  mmdet3d/core/utils/__init__.py                      +4    -2
  mmdet3d/core/utils/gaussian.py                      +72   -0
  mmdet3d/models/dense_heads/__init__.py              +3    -1
  mmdet3d/models/dense_heads/monoflex_head.py         +770  -0
  mmdet3d/models/model_utils/__init__.py              +2    -1
  mmdet3d/models/model_utils/edge_fusion_module.py    +77   -0
  mmdet3d/models/utils/edge_indices.py                +11   -1
  tests/test_models/test_heads/test_heads.py          +59   -0
  tests/test_utils/test_bbox_coders.py                +67   -0
  tests/test_utils/test_utils.py                      +8    -4
mmdet3d/core/bbox/coders/monoflex_bbox_coder.py (view @ 8538177b)

@@ -81,16 +81,16 @@ class MonoFlexCoder(BaseBBoxCoder):
             torch.Tensor: Targets of orientations.
         """
         local_yaw = gt_bboxes_3d.local_yaw

         # encode local yaw (-pi ~ pi) to multibin format
-        encode_local_yaw = np.zeros(self.num_dir_bins * 2)
+        encode_local_yaw = local_yaw.new_zeros(
+            [local_yaw.shape[0], self.num_dir_bins * 2])
         bin_size = 2 * np.pi / self.num_dir_bins
         margin_size = bin_size * self.bin_margin

-        bin_centers = self.bin_centers
+        bin_centers = local_yaw.new_tensor(self.bin_centers)
         range_size = bin_size / 2 + margin_size

-        offsets = local_yaw - bin_centers.unsqueeze(0)
+        offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0)
         offsets[offsets > np.pi] = offsets[offsets > np.pi] - 2 * np.pi
         offsets[offsets < -np.pi] = offsets[offsets < -np.pi] + 2 * np.pi

@@ -98,7 +98,7 @@ class MonoFlexCoder(BaseBBoxCoder):
             offset = offsets[:, i]
             inds = abs(offset) < range_size
             encode_local_yaw[inds, i] = 1
-            encode_local_yaw[inds, i + self.num_dir_bins] = offset
+            encode_local_yaw[inds, i + self.num_dir_bins] = offset[inds]

         orientation_target = encode_local_yaw

@@ -164,7 +164,7 @@ class MonoFlexCoder(BaseBBoxCoder):
         pred_direct_depth_uncertainty = bbox[:, 49:50].squeeze(-1)

         # 2 dimension of offsets x keypoints (8 corners + top/bottom center)
-        pred_keypoints2d = bbox[:, 6:26]
+        pred_keypoints2d = bbox[:, 6:26].reshape(-1, 10, 2)

         # 1 dimension for depth offsets
         pred_direct_depth_offsets = bbox[:, 48:49].squeeze(-1)

@@ -273,11 +273,11 @@ class MonoFlexCoder(BaseBBoxCoder):
             raise NotImplementedError

         # (N, 3)
         centers2d_img = \
-            torch.cat(centers2d_img, depths.unsqueeze(-1), dim=1)
+            torch.cat((centers2d_img, depths.unsqueeze(-1)), dim=1)
         # (N, 4, 1)
         centers2d_extend = \
             torch.cat((centers2d_img, centers2d_img.new_ones(N, 1)),
-                      dim=1).unqueeze(-1)
+                      dim=1).unsqueeze(-1)
         locations = torch.matmul(cam2imgs_inv, centers2d_extend).squeeze(-1)

         return locations[:, :3]

@@ -450,15 +450,15 @@ class MonoFlexCoder(BaseBBoxCoder):
         local_yaws = orientations
         yaws = local_yaws + rays

-        larger_idx = (yaws > np.pi).nonzero()
-        small_idx = (yaws < -np.pi).nonzero()
+        larger_idx = (yaws > np.pi).nonzero(as_tuple=False)
+        small_idx = (yaws < -np.pi).nonzero(as_tuple=False)
         if len(larger_idx) != 0:
             yaws[larger_idx] -= 2 * np.pi
         if len(small_idx) != 0:
             yaws[small_idx] += 2 * np.pi

-        larger_idx = (local_yaws > np.pi).nonzero()
-        small_idx = (local_yaws < -np.pi).nonzero()
+        larger_idx = (local_yaws > np.pi).nonzero(as_tuple=False)
+        small_idx = (local_yaws < -np.pi).nonzero(as_tuple=False)
         if len(larger_idx) != 0:
             local_yaws[larger_idx] -= 2 * np.pi
         if len(small_idx) != 0:
@@ -491,7 +491,7 @@ class MonoFlexCoder(BaseBBoxCoder):
         return bboxes2d

-    def combine_depths(depth, depth_uncertainty):
+    def combine_depths(self, depth, depth_uncertainty):
         """Combine all the predicted depths with depth uncertainty.

         Args:
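The first two hunks above turn the scalar multibin encoding into a batched one: `encode` now produces one row per box, whose first `num_dir_bins` columns are bin-confidence targets and whose remaining columns are in-bin residuals, with `offset[inds]` masking the residuals to the boxes falling in each bin. A standalone sketch of the fixed logic, with made-up yaw values (only numpy/torch assumed):

import numpy as np
import torch

num_dir_bins = 4
bin_centers = torch.tensor([0, np.pi / 2, np.pi, -np.pi / 2])
bin_margin = np.pi / 6

local_yaw = torch.tensor([0.3, 2.9, -1.6])  # (N,) local yaws in (-pi, pi]
encoded = local_yaw.new_zeros([local_yaw.shape[0], num_dir_bins * 2])

bin_size = 2 * np.pi / num_dir_bins
margin_size = bin_size * bin_margin
range_size = bin_size / 2 + margin_size

# (N, num_dir_bins) signed offsets to every bin center, wrapped to (-pi, pi]
offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0)
offsets[offsets > np.pi] -= 2 * np.pi
offsets[offsets < -np.pi] += 2 * np.pi

for i in range(num_dir_bins):
    offset = offsets[:, i]
    inds = abs(offset) < range_size
    encoded[inds, i] = 1                             # bin confidence target
    encoded[inds, i + num_dir_bins] = offset[inds]   # bin residual target

Because bin_margin > 0 makes neighbouring bins overlap, a single yaw can activate several bins, which is exactly why the residual assignment needs the `inds` mask on both sides.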
mmdet3d/core/bbox/structures/utils.py (view @ 8538177b)

@@ -324,8 +324,11 @@ def yaw2local(yaw, loc):
         torch.Tensor: local yaw (alpha in kitti).
     """
     local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2])
-    while local_yaw > np.pi:
-        local_yaw -= np.pi * 2
-    while local_yaw < -np.pi:
-        local_yaw += np.pi * 2
+    larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False)
+    small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False)
+    if len(larger_idx) != 0:
+        local_yaw[larger_idx] -= 2 * np.pi
+    if len(small_idx) != 0:
+        local_yaw[small_idx] += 2 * np.pi

     return local_yaw
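`yaw2local` previously normalized with a Python `while` loop, which only works when `local_yaw` is a scalar; the new indexing form wraps a whole tensor of yaws at once. A minimal sketch of the difference (values made up):

import numpy as np
import torch

yaw = torch.tensor([3.5, -3.5, 0.5])  # radians, first two outside (-pi, pi]
# `while yaw > np.pi:` raises on multi-element tensors; index instead:
larger_idx = (yaw > np.pi).nonzero(as_tuple=False)
small_idx = (yaw < -np.pi).nonzero(as_tuple=False)
yaw[larger_idx] -= 2 * np.pi
yaw[small_idx] += 2 * np.pi
print(yaw)  # tensor([-2.7832,  2.7832,  0.5000])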
mmdet3d/core/utils/__init__.py (view @ 8538177b)

 # Copyright (c) OpenMMLab. All rights reserved.
 from .array_converter import ArrayConverter, array_converter
-from .gaussian import draw_heatmap_gaussian, gaussian_2d, gaussian_radius
+from .gaussian import (draw_heatmap_gaussian, ellip_gaussian2D, gaussian_2d,
+                       gaussian_radius, get_ellip_gaussian_2D)

 __all__ = [
     'gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian',
-    'ArrayConverter', 'array_converter'
+    'ArrayConverter', 'array_converter', 'ellip_gaussian2D',
+    'get_ellip_gaussian_2D'
 ]
mmdet3d/core/utils/gaussian.py (view @ 8538177b)

@@ -84,3 +84,75 @@ def gaussian_radius(det_size, min_overlap=0.5):
     sq3 = torch.sqrt(b3**2 - 4 * a3 * c3)
     r3 = (b3 + sq3) / 2
     return min(r1, r2, r3)
+
+
+def get_ellip_gaussian_2D(heatmap, center, radius_x, radius_y, k=1):
+    """Generate 2D ellipse gaussian heatmap.
+
+    Args:
+        heatmap (Tensor): Input heatmap, the gaussian kernel will cover on
+            it and maintain the max value.
+        center (list[int]): Coord of gaussian kernel's center.
+        radius_x (int): X-axis radius of gaussian kernel.
+        radius_y (int): Y-axis radius of gaussian kernel.
+        k (int, optional): Coefficient of gaussian kernel. Default: 1.
+
+    Returns:
+        out_heatmap (Tensor): Updated heatmap covered by gaussian kernel.
+    """
+    diameter_x, diameter_y = 2 * radius_x + 1, 2 * radius_y + 1
+    gaussian_kernel = ellip_gaussian2D((radius_x, radius_y),
+                                       sigma_x=diameter_x / 6,
+                                       sigma_y=diameter_y / 6,
+                                       dtype=heatmap.dtype,
+                                       device=heatmap.device)
+
+    x, y = int(center[0]), int(center[1])
+    height, width = heatmap.shape[0:2]
+
+    left, right = min(x, radius_x), min(width - x, radius_x + 1)
+    top, bottom = min(y, radius_y), min(height - y, radius_y + 1)
+
+    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
+    masked_gaussian = gaussian_kernel[radius_y - top:radius_y + bottom,
+                                      radius_x - left:radius_x + right]
+    out_heatmap = heatmap
+    torch.max(
+        masked_heatmap,
+        masked_gaussian * k,
+        out=out_heatmap[y - top:y + bottom, x - left:x + right])
+
+    return out_heatmap
+
+
+def ellip_gaussian2D(radius,
+                     sigma_x,
+                     sigma_y,
+                     dtype=torch.float32,
+                     device='cpu'):
+    """Generate 2D ellipse gaussian kernel.
+
+    Args:
+        radius (tuple(int)): Ellipse radius (radius_x, radius_y) of gaussian
+            kernel.
+        sigma_x (int): X-axis sigma of gaussian function.
+        sigma_y (int): Y-axis sigma of gaussian function.
+        dtype (torch.dtype, optional): Dtype of gaussian tensor.
+            Default: torch.float32.
+        device (str, optional): Device of gaussian tensor.
+            Default: 'cpu'.
+
+    Returns:
+        h (Tensor): Gaussian kernel with a
+            ``(2 * radius_y + 1) * (2 * radius_x + 1)`` shape.
+    """
+    x = torch.arange(
+        -radius[0], radius[0] + 1, dtype=dtype, device=device).view(1, -1)
+    y = torch.arange(
+        -radius[1], radius[1] + 1, dtype=dtype, device=device).view(-1, 1)
+
+    h = (-(x * x) / (2 * sigma_x * sigma_x) -
+         (y * y) / (2 * sigma_y * sigma_y)).exp()
+    h[h < torch.finfo(h.dtype).eps * h.max()] = 0
+
+    return h
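For truncated objects, MonoFlex paints an elliptical rather than isotropic gaussian around the clipped box center. A quick sanity check of the new helper (heatmap size and center values are made up for illustration):

import torch
from mmdet3d.core.utils import get_ellip_gaussian_2D

heatmap = torch.zeros((48, 160))   # one class channel of the target heatmap
center = [155, 20]                 # base center clipped near the image edge
out = get_ellip_gaussian_2D(heatmap, center, radius_x=12, radius_y=3, k=1)
assert out[20, 155] == 1.0         # gaussian peak sits on the edge point
assert out.shape == (48, 160)      # heatmap is updated in place and returned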
mmdet3d/models/dense_heads/__init__.py (view @ 8538177b)

@@ -7,6 +7,7 @@ from .centerpoint_head import CenterHead
 from .fcos_mono3d_head import FCOSMono3DHead
 from .free_anchor3d_head import FreeAnchor3DHead
 from .groupfree3d_head import GroupFree3DHead
+from .monoflex_head import MonoFlexHead
 from .parta2_rpn_head import PartA2RPNHead
 from .pgd_head import PGDHead
 from .point_rpn_head import PointRPNHead

@@ -19,5 +20,6 @@ __all__ = [
     'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead',
     'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
     'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead',
-    'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead'
+    'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead',
+    'MonoFlexHead'
 ]
mmdet3d/models/dense_heads/monoflex_head.py (new file @ 8538177b, mode 100644)

import torch
from mmcv.cnn import xavier_init
from torch import nn as nn

from mmdet3d.core.utils import get_ellip_gaussian_2D
from mmdet3d.models.model_utils import EdgeFusionModule
from mmdet3d.models.utils import (filter_outside_objs, get_edge_indices,
                                  get_keypoints, handle_proj_objs)
from mmdet.core import multi_apply
from mmdet.core.bbox.builder import build_bbox_coder
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.utils import gaussian_radius, gen_gaussian_target
from mmdet.models.utils.gaussian_target import (get_local_maximum,
                                                get_topk_from_heatmap,
                                                transpose_and_gather_feat)
from .anchor_free_mono3d_head import AnchorFreeMono3DHead


@HEADS.register_module()
class MonoFlexHead(AnchorFreeMono3DHead):
    r"""MonoFlex head used in `MonoFlex <https://arxiv.org/abs/2104.02323>`_

    .. code-block:: none

                / --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> cls
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> 2d bbox
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> 2d offsets
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> keypoints offsets
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
        feature
                | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> 3d dimensions
                |
                |                  |--- 1 x 1 conv --> ori cls
                | --> 3 x 3 conv --|
                |                  |--- 1 x 1 conv --> ori offsets
                |
                | --> 3 x 3 conv --> 1 x 1 conv --> depth
                |
                \ --> 3 x 3 conv --> 1 x 1 conv --> depth uncertainty

    Args:
        use_edge_fusion (bool): Whether to use edge fusion module while
            feature extraction.
        edge_fusion_inds (list[tuple]): Indices of feature to use edge fusion.
        edge_heatmap_ratio (float): Ratio of generating target heatmap.
        filter_outside_objs (bool, optional): Whether to filter the
            outside objects. Default: True.
        loss_cls (dict, optional): Config of classification loss.
            Default: loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0).
        loss_bbox (dict, optional): Config of localization loss.
            Default: loss_bbox=dict(type='IoULoss', loss_weight=0.1).
        loss_dir (dict, optional): Config of direction classification loss.
            Default: dict(type='MultiBinLoss', loss_weight=0.1).
        loss_keypoints (dict, optional): Config of keypoints loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_dims (dict, optional): Config of dimensions loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_offsets2d (dict, optional): Config of offsets2d loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_direct_depth (dict, optional): Config of directly regressed depth
            loss. Default: dict(type='L1Loss', loss_weight=0.1).
        loss_keypoints_depth (dict, optional): Config of keypoints decoded
            depth loss. Default: dict(type='L1Loss', loss_weight=0.1).
        loss_combined_depth (dict, optional): Config of combined depth loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_attr (dict, optional): Config of attribute classification loss.
            In MonoFlex, Default: None.
        bbox_coder (dict, optional): Bbox coder for encoding and decoding
            boxes. Default: dict(type='MonoFlexCoder', code_size=7).
        norm_cfg (dict, optional): Dictionary to construct and config norm
            layer. Default: norm_cfg=dict(type='BN').
        init_cfg (dict): Initialization config dict. Default: None.
    """  # noqa: E501

    def __init__(self,
                 num_classes,
                 in_channels,
                 use_edge_fusion,
                 edge_fusion_inds,
                 edge_heatmap_ratio,
                 filter_outside_objs=True,
                 loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0),
                 loss_bbox=dict(type='IoULoss', loss_weight=0.1),
                 loss_dir=dict(type='MultiBinLoss', loss_weight=0.1),
                 loss_keypoints=dict(type='L1Loss', loss_weight=0.1),
                 loss_dims=dict(type='L1Loss', loss_weight=0.1),
                 loss_offsets2d=dict(type='L1Loss', loss_weight=0.1),
                 loss_direct_depth=dict(type='L1Loss', loss_weight=0.1),
                 loss_keypoints_depth=dict(type='L1Loss', loss_weight=0.1),
                 loss_combined_depth=dict(type='L1Loss', loss_weight=0.1),
                 loss_attr=None,
                 bbox_coder=dict(type='MonoFlexCoder', code_size=7),
                 norm_cfg=dict(type='BN'),
                 init_cfg=None,
                 init_bias=-2.19,
                 **kwargs):
        self.use_edge_fusion = use_edge_fusion
        self.edge_fusion_inds = edge_fusion_inds
        super().__init__(
            num_classes,
            in_channels,
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            loss_dir=loss_dir,
            loss_attr=loss_attr,
            norm_cfg=norm_cfg,
            init_cfg=init_cfg,
            **kwargs)
        self.filter_outside_objs = filter_outside_objs
        self.edge_heatmap_ratio = edge_heatmap_ratio
        self.init_bias = init_bias
        self.loss_dir = build_loss(loss_dir)
        self.loss_keypoints = build_loss(loss_keypoints)
        self.loss_dims = build_loss(loss_dims)
        self.loss_offsets2d = build_loss(loss_offsets2d)
        self.loss_direct_depth = build_loss(loss_direct_depth)
        self.loss_keypoints_depth = build_loss(loss_keypoints_depth)
        self.loss_combined_depth = build_loss(loss_combined_depth)
        self.bbox_coder = build_bbox_coder(bbox_coder)

    def _init_edge_module(self):
        """Initialize edge fusion module for feature extraction."""
        self.edge_fuse_cls = EdgeFusionModule(self.num_classes, 256)
        for i in range(len(self.edge_fusion_inds)):
            reg_inds, out_inds = self.edge_fusion_inds[i]
            out_channels = self.group_reg_dims[reg_inds][out_inds]
            fusion_layer = EdgeFusionModule(out_channels, 256)
            layer_name = f'edge_fuse_reg_{reg_inds}_{out_inds}'
            self.add_module(layer_name, fusion_layer)

    def init_weights(self):
        """Initialize weights."""
        super().init_weights()
        self.conv_cls.bias.data.fill_(self.init_bias)
        xavier_init(self.conv_regs[4][0], gain=0.01)
        xavier_init(self.conv_regs[7][0], gain=0.01)
        for m in self.conv_regs.modules():
            if isinstance(m, nn.Conv2d):
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _init_predictor(self):
        """Initialize predictor layers of the head."""
        self.conv_cls_prev = self._init_branch(
            conv_channels=self.cls_branch,
            conv_strides=(1, ) * len(self.cls_branch))
        self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels,
                                  1)
        # init regression head
        self.conv_reg_prevs = nn.ModuleList()
        # init output head
        self.conv_regs = nn.ModuleList()
        # group_reg_dims:
        # ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, ))
        for i in range(len(self.group_reg_dims)):
            reg_dims = self.group_reg_dims[i]
            reg_branch_channels = self.reg_branch[i]
            out_channel = self.out_channels[i]
            reg_list = nn.ModuleList()
            if len(reg_branch_channels) > 0:
                self.conv_reg_prevs.append(
                    self._init_branch(
                        conv_channels=reg_branch_channels,
                        conv_strides=(1, ) * len(reg_branch_channels)))
                for reg_dim in reg_dims:
                    reg_list.append(nn.Conv2d(out_channel, reg_dim, 1))
                self.conv_regs.append(reg_list)
            else:
                self.conv_reg_prevs.append(None)
                for reg_dim in reg_dims:
                    reg_list.append(nn.Conv2d(self.feat_channels, reg_dim, 1))
                self.conv_regs.append(reg_list)

    def _init_layers(self):
        """Initialize layers of the head."""
        self._init_predictor()
        if self.use_edge_fusion:
            self._init_edge_module()

    def forward_train(self, x, input_metas, gt_bboxes, gt_labels,
                      gt_bboxes_3d, gt_labels_3d, centers2d, depths,
                      attr_labels, gt_bboxes_ignore, proposal_cfg, **kwargs):
        """
        Args:
            x (list[Tensor]): Features from FPN.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes (list[Tensor]): Ground truth bboxes of the image,
                shape (num_gts, 4).
            gt_labels (list[Tensor]): Ground truth labels of each box,
                shape (num_gts,).
            gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image,
                shape (num_gts, self.bbox_code_size).
            gt_labels_3d (list[Tensor]): 3D ground truth labels of each box,
                shape (num_gts,).
            centers2d (list[Tensor]): Projected 3D center of each box,
                shape (num_gts, 2).
            depths (list[Tensor]): Depth of projected 3D center of each box,
                shape (num_gts,).
            attr_labels (list[Tensor]): Attribute labels of each box,
                shape (num_gts,).
            gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be
                ignored, shape (num_ignored_gts, 4).
            proposal_cfg (mmcv.Config): Test / postprocessing configuration;
                if None, test_cfg would be used.

        Returns:
            tuple:
                losses (dict[str, Tensor]): A dictionary of loss components.
                proposal_list (list[Tensor]): Proposals of each image.
        """
        outs = self(x, input_metas)
        if gt_labels is None:
            loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths,
                                  attr_labels, input_metas)
        else:
            loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d,
                                  gt_labels_3d, centers2d, depths,
                                  attr_labels, input_metas)
        losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        if proposal_cfg is None:
            return losses
        else:
            proposal_list = self.get_bboxes(
                *outs, input_metas, cfg=proposal_cfg)
            return losses, proposal_list

    def forward(self, feats, input_metas):
        """Forward features from the upstream network.

        Args:
            feats (list[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.

        Returns:
            tuple:
                cls_scores (list[Tensor]): Box scores for each scale level,
                    each is a 4D-tensor, the channel number is
                    num_points * num_classes.
                bbox_preds (list[Tensor]): Box energies / deltas for each
                    scale level, each is a 4D-tensor, the channel number is
                    num_points * bbox_code_size.
        """
        mlvl_input_metas = [input_metas for i in range(len(feats))]
        return multi_apply(self.forward_single, feats, mlvl_input_metas)

    def forward_single(self, x, input_metas):
        """Forward features of a single scale level.

        Args:
            x (Tensor): Feature maps from a specific FPN feature level.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.

        Returns:
            tuple: Scores for each class, bbox predictions.
        """
        img_h, img_w = input_metas[0]['pad_shape'][:2]
        batch_size, _, feat_h, feat_w = x.shape
        downsample_ratio = img_h / feat_h

        for conv_cls_prev_layer in self.conv_cls_prev:
            cls_feat = conv_cls_prev_layer(x)
        out_cls = self.conv_cls(cls_feat)

        if self.use_edge_fusion:
            # calculate the edge indices for the batch data
            edge_indices_list = get_edge_indices(
                input_metas, downsample_ratio, device=x.device)
            edge_lens = [
                edge_indices.shape[0] for edge_indices in edge_indices_list
            ]
            max_edge_len = max(edge_lens)
            edge_indices = x.new_zeros((batch_size, max_edge_len, 2),
                                       dtype=torch.long)
            for i in range(batch_size):
                edge_indices[i, :edge_lens[i]] = edge_indices_list[i]
            # cls feature map edge fusion
            out_cls = self.edge_fuse_cls(cls_feat, out_cls, edge_indices,
                                         edge_lens, feat_h, feat_w)

        bbox_pred = []
        for i in range(len(self.group_reg_dims)):
            reg_feat = x.clone()
            # feature regression head
            if len(self.reg_branch[i]) > 0:
                for conv_reg_prev_layer in self.conv_reg_prevs[i]:
                    reg_feat = conv_reg_prev_layer(reg_feat)
            for j, conv_reg in enumerate(self.conv_regs[i]):
                out_reg = conv_reg(reg_feat)
                # Use Edge Fusion Module
                if self.use_edge_fusion and (i, j) in self.edge_fusion_inds:
                    # reg feature map edge fusion
                    out_reg = getattr(self, 'edge_fuse_reg_{}_{}'.format(
                        i, j))(reg_feat, out_reg, edge_indices, edge_lens,
                               feat_h, feat_w)
                bbox_pred.append(out_reg)
        bbox_pred = torch.cat(bbox_pred, dim=1)
        cls_score = out_cls.sigmoid()  # turn to 0-1
        cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4)

        return cls_score, bbox_pred

    def get_bboxes(self, cls_scores, bbox_preds, input_metas):
        """Generate bboxes from bbox head predictions.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level.
            bbox_preds (list[Tensor]): Box regression for each scale.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            rescale (bool): If True, return boxes in original image space.

        Returns:
            list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]:
                Each item in result_list is a 4-tuple.
        """
        assert len(cls_scores) == len(bbox_preds) == 1
        cam2imgs = torch.stack([
            cls_scores[0].new_tensor(input_meta['cam2img'])
            for input_meta in input_metas
        ])
        batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap(
            cls_scores[0],
            bbox_preds[0],
            input_metas,
            cam2imgs=cam2imgs,
            topk=100,
            kernel=3)

        result_list = []
        for img_id in range(len(input_metas)):
            bboxes = batch_bboxes[img_id]
            scores = batch_scores[img_id]
            labels = batch_topk_labels[img_id]

            keep_idx = scores > 0.25
            bboxes = bboxes[keep_idx]
            scores = scores[keep_idx]
            labels = labels[keep_idx]

            bboxes = input_metas[img_id]['box_type_3d'](
                bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5))
            attrs = None
            result_list.append((bboxes, scores, labels, attrs))

        return result_list

    def decode_heatmap(self,
                       cls_score,
                       reg_pred,
                       input_metas,
                       cam2imgs,
                       topk=100,
                       kernel=3):
        """Transform outputs into detections raw bbox predictions.

        Args:
            class_score (Tensor): Center predict heatmap,
                shape (B, num_classes, H, W).
            reg_pred (Tensor): Box regression map.
                shape (B, channel, H, W).
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            cam2imgs (Tensor): Camera intrinsic matrix.
                shape (N, 4, 4)
            topk (int, optional): Get top k center keypoints from heatmap.
                Default 100.
            kernel (int, optional): Max pooling kernel for extracting local
                maximum pixels. Default 3.

        Returns:
            tuple[torch.Tensor]: Decoded output of SMOKEHead, containing
                the following Tensors:
              - batch_bboxes (Tensor): Coords of each 3D box.
                    shape (B, k, 7)
              - batch_scores (Tensor): Scores of each 3D box.
                    shape (B, k)
              - batch_topk_labels (Tensor): Categories of each 3D box.
                    shape (B, k)
        """
        img_h, img_w = input_metas[0]['pad_shape'][:2]
        batch_size, _, feat_h, feat_w = cls_score.shape

        downsample_ratio = img_h / feat_h
        center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel)

        *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap(
            center_heatmap_pred, k=topk)
        batch_scores, batch_index, batch_topk_labels = batch_dets

        regression = transpose_and_gather_feat(reg_pred, batch_index)
        regression = regression.view(-1, 8)

        pred_base_centers2d = torch.cat(
            [topk_xs.view(-1, 1), topk_ys.view(-1, 1).float()], dim=1)
        preds = self.bbox_coder.decode(regression, batch_topk_labels,
                                       downsample_ratio, cam2imgs)
        pred_locations = self.bbox_coder.decode_location(
            pred_base_centers2d, preds['offsets2d'], preds['combined_depth'],
            cam2imgs, downsample_ratio)
        pred_yaws = self.bbox_coder.decode_orientation(
            preds['orientations']).unsqueeze(-1)
        pred_dims = preds['dimensions']
        batch_bboxes = torch.cat((pred_locations, pred_dims, pred_yaws),
                                 dim=1)
        batch_bboxes = batch_bboxes.view(batch_size, -1, self.bbox_code_size)
        return batch_bboxes, batch_scores, batch_topk_labels

    def get_predictions(self, pred_reg, labels3d, centers2d, reg_mask,
                        batch_indices, input_metas, downsample_ratio):
        """Prepare predictions for computing loss.

        Args:
            pred_reg (Tensor): Box regression map.
                shape (B, channel, H, W).
            labels3d (Tensor): Labels of each 3D box.
                shape (B * max_objs, )
            centers2d (Tensor): Coords of each projected 3D box
                center on image. shape (N, 2)
            reg_mask (Tensor): Indexes of the existence of the 3D box.
                shape (B * max_objs, )
            batch_indices (Tensor): Batch indices of the 3D box.
                shape (N, 3)
            input_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            downsample_ratio (int): The stride of feature map.

        Returns:
            dict: The predictions for computing loss.
        """
        batch, channel = pred_reg.shape[0], pred_reg.shape[1]
        w = pred_reg.shape[3]
        cam2imgs = torch.stack([
            centers2d.new_tensor(input_meta['cam2img'])
            for input_meta in input_metas
        ])
        # (batch_size, 4, 4) -> (N, 4, 4)
        cam2imgs = cam2imgs[batch_indices, :, :]
        centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0]
        centers2d_inds = centers2d_inds.view(batch, -1)
        pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds)
        pred_regression_pois = pred_regression.view(-1, channel)[reg_mask]
        preds = self.bbox_coder.decode(pred_regression_pois, labels3d,
                                       downsample_ratio, cam2imgs)

        return preds

    def get_targets(self, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list,
                    gt_labels_3d_list, centers2d_list, depths_list,
                    feat_shape, img_shape, input_metas):
        """Get training targets for batch images.

        Args:
            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each
                image, shape (num_gt, 4).
            gt_labels_list (list[Tensor]): Ground truth labels of each
                box, shape (num_gt,).
            gt_bboxes_3d_list (list[:obj:`CameraInstance3DBoxes`]): 3D
                Ground truth bboxes of each image,
                shape (num_gt, bbox_code_size).
            gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of
                each box, shape (num_gt,).
            centers2d_list (list[Tensor]): Projected 3D centers onto 2D
                image, shape (num_gt, 2).
            depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
                image, each has shape (num_gt, 1).
            feat_shape (tuple[int]): Feature map shape with value,
                shape (B, _, H, W).
            img_shape (tuple[int]): Image shape in [h, w] format.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.

        Returns:
            tuple[Tensor, dict]: The Tensor value is the targets of
                center heatmap, the dict has components below:
            - base_centers2d_target (Tensor): Coords of each projected 3D box
                center on image. shape (B * max_objs, 2), [dtype: int]
            - labels3d (Tensor): Labels of each 3D box.
                shape (N, )
            - reg_mask (Tensor): Mask of the existence of the 3D box.
                shape (B * max_objs, )
            - batch_indices (Tensor): Batch id of the 3D box.
                shape (N, )
            - depth_target (Tensor): Depth target of each 3D box.
                shape (N, )
            - keypoints2d_target (Tensor): Keypoints of each projected 3D box
                on image. shape (N, 10, 2)
            - keypoints_mask (Tensor): Keypoints mask of each projected 3D
                box on image. shape (N, 10)
            - keypoints_depth_mask (Tensor): Depths decoded from keypoints
                of each 3D box. shape (N, 3)
            - orientations_target (Tensor): Orientation (encoded local yaw)
                target of each 3D box. shape (N, )
            - offsets2d_target (Tensor): Offsets target of each projected
                3D box. shape (N, 2)
            - dimensions_target (Tensor): Dimensions target of each 3D box.
                shape (N, 3)
            - downsample_ratio (int): The stride of feature map.
        """
        img_h, img_w = img_shape[:2]
        batch_size, _, feat_h, feat_w = feat_shape

        width_ratio = float(feat_w / img_w)  # 1/4
        height_ratio = float(feat_h / img_h)  # 1/4

        assert width_ratio == height_ratio

        # Whether to filter the objects which are not in FOV.
        if self.filter_outside_objs:
            filter_outside_objs(gt_bboxes_list, gt_labels_list,
                                gt_bboxes_3d_list, gt_labels_3d_list,
                                centers2d_list, input_metas)

        # transform centers2d to base centers2d for regression and
        # heatmap generation.
        # centers2d = int(base_centers2d) + offsets2d
        base_centers2d_list, offsets2d_list, trunc_mask_list = \
            handle_proj_objs(centers2d_list, gt_bboxes_list, input_metas)

        keypoints2d_list, keypoints_mask_list, keypoints_depth_mask_list = \
            get_keypoints(gt_bboxes_3d_list, centers2d_list, input_metas)

        center_heatmap_target = gt_bboxes_list[-1].new_zeros(
            [batch_size, self.num_classes, feat_h, feat_w])

        for batch_id in range(batch_size):
            # project gt_bboxes from input image to feat map
            gt_bboxes = gt_bboxes_list[batch_id] * width_ratio
            gt_labels = gt_labels_list[batch_id]

            # project base centers2d from input image to feat map
            gt_base_centers2d = base_centers2d_list[batch_id] * width_ratio
            trunc_masks = trunc_mask_list[batch_id]

            for j, base_center2d in enumerate(gt_base_centers2d):
                if trunc_masks[j]:
                    # for outside objects, generate ellipse heatmap
                    base_center2d_x_int, base_center2d_y_int = \
                        base_center2d.int()
                    scale_box_w = min(base_center2d_x_int - gt_bboxes[j][0],
                                      gt_bboxes[j][2] - base_center2d_x_int)
                    scale_box_h = min(base_center2d_y_int - gt_bboxes[j][1],
                                      gt_bboxes[j][3] - base_center2d_y_int)
                    radius_x = scale_box_w * self.edge_heatmap_ratio
                    radius_y = scale_box_h * self.edge_heatmap_ratio
                    radius_x, radius_y = max(0, int(radius_x)), max(
                        0, int(radius_y))
                    assert min(radius_x, radius_y) == 0
                    ind = gt_labels[j]
                    get_ellip_gaussian_2D(
                        center_heatmap_target[batch_id, ind],
                        [base_center2d_x_int, base_center2d_y_int], radius_x,
                        radius_y)
                else:
                    base_center2d_x_int, base_center2d_y_int = \
                        base_center2d.int()
                    scale_box_h = (gt_bboxes[j][3] - gt_bboxes[j][1])
                    scale_box_w = (gt_bboxes[j][2] - gt_bboxes[j][0])
                    radius = gaussian_radius([scale_box_h, scale_box_w],
                                             min_overlap=0.7)
                    radius = max(0, int(radius))
                    ind = gt_labels[j]
                    gen_gaussian_target(
                        center_heatmap_target[batch_id, ind],
                        [base_center2d_x_int, base_center2d_y_int], radius)

        avg_factor = max(1, center_heatmap_target.eq(1).sum())
        num_ctrs = [centers2d.shape[0] for centers2d in centers2d_list]
        max_objs = max(num_ctrs)
        batch_indices = [
            centers2d_list[0].new_full((num_ctrs[i], ), i)
            for i in range(batch_size)
        ]
        batch_indices = torch.cat(batch_indices, dim=0)
        reg_mask = torch.zeros(
            (batch_size, max_objs),
            dtype=torch.bool).to(base_centers2d_list[0].device)
        gt_bboxes_3d = input_metas['box_type_3d'].cat(gt_bboxes_3d_list)
        gt_bboxes_3d = gt_bboxes_3d.to(base_centers2d_list[0].device)

        # encode original local yaw to multibin format
        orienations_target = self.bbox_coder.encode(gt_bboxes_3d)

        batch_base_centers2d = base_centers2d_list[0].new_zeros(
            (batch_size, max_objs, 2))

        for i in range(batch_size):
            reg_mask[i, :num_ctrs[i]] = 1
            batch_base_centers2d[i, :num_ctrs[i]] = base_centers2d_list[i]

        flatten_reg_mask = reg_mask.flatten()

        # transform base centers2d from input scale to output scale
        batch_base_centers2d = batch_base_centers2d.view(-1, 2) * width_ratio

        dimensions_target = gt_bboxes_3d.tensor[:, 3:6]
        labels_3d = torch.cat(gt_labels_3d_list)
        keypoints2d_target = torch.cat(keypoints2d_list)
        keypoints_mask = torch.cat(keypoints_mask_list)
        keypoints_depth_mask = torch.cat(keypoints_depth_mask_list)
        offsets2d_target = torch.cat(offsets2d_list)
        bboxes2d = torch.cat(gt_bboxes_list)

        # transform FCOS style bbox into [x1, y1, x2, y2] format.
        bboxes2d_target = torch.cat([bboxes2d[:, 0:2] * -1, bboxes2d[:, 2:]],
                                    dim=-1)
        depths = torch.cat(depths_list)

        target_labels = dict(
            base_centers2d_target=batch_base_centers2d.int(),
            labels3d=labels_3d,
            reg_mask=flatten_reg_mask,
            batch_indices=batch_indices,
            bboxes2d_target=bboxes2d_target,
            depth_target=depths,
            keypoints2d_target=keypoints2d_target,
            keypoints_mask=keypoints_mask,
            keypoints_depth_mask=keypoints_depth_mask,
            orienations_target=orienations_target,
            offsets2d_target=offsets2d_target,
            dimensions_target=dimensions_target,
            downsample_ratio=1 / width_ratio)

        return center_heatmap_target, avg_factor, target_labels

    def loss(self,
             cls_scores,
             bbox_preds,
             gt_bboxes,
             gt_labels,
             gt_bboxes_3d,
             gt_labels_3d,
             centers2d,
             depths,
             attr_labels,
             input_metas,
             gt_bboxes_ignore=None):
        """Compute loss of the head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level.
                shape (num_gt, 4).
            bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel
                number is bbox_code_size.
                shape (B, 7, H, W).
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image.
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): Class indices corresponding to each box.
                shape (num_gts, ).
            gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D boxes ground
                truth. It is the flipped gt_bboxes.
            gt_labels_3d (list[Tensor]): Same as gt_labels.
            centers2d (list[Tensor]): 2D centers on the image.
                shape (num_gts, 2).
            depths (list[Tensor]): Depth ground truth.
                shape (num_gts, ).
            attr_labels (list[Tensor]): Attributes indices of each box.
                In kitti it's None.
            input_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
                boxes can be ignored when computing the loss.
                Default: None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert len(cls_scores) == len(bbox_preds) == 1
        assert attr_labels is None
        assert gt_bboxes_ignore is None
        center2d_heatmap = cls_scores[0]
        pred_reg = bbox_preds[0]

        center2d_heatmap_target, avg_factor, target_labels = \
            self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d,
                             gt_labels_3d, centers2d, depths,
                             center2d_heatmap.shape,
                             input_metas[0]['pad_shape'],
                             input_metas)

        preds = self.get_predictions(
            pred_reg=pred_reg,
            labels3d=target_labels['labels3d'],
            centers2d=target_labels['base_centers2d_target'],
            reg_mask=target_labels['reg_mask'],
            batch_indices=target_labels['batch_indices'],
            input_metas=input_metas,
            downsample_ratio=target_labels['downsample_ratio'])

        # heatmap loss
        loss_cls = self.loss_cls(
            center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor)

        # bbox2d regression loss
        loss_bbox = self.loss_bbox(preds['bboxes2d'],
                                   target_labels['bboxes2d_target'])

        # keypoints loss; the keypoints in predictions and target are all
        # local coordinates. The mask dtype should be bool, not int or
        # float, to ensure the indexing is a bool index.
        keypoints2d_mask = target_labels['keypoints2d_mask']
        loss_keypoints = self.loss_keypoints(
            preds['keypoints2d'][keypoints2d_mask],
            target_labels['keypoints2d_target'][keypoints2d_mask])

        # orientations loss
        loss_dir = self.loss_dir(preds['orientations'],
                                 target_labels['orientations_target'])

        # dimensions loss
        loss_dims = self.loss_dims(preds['dimensions'],
                                   target_labels['dimensions_target'])

        # offsets for center heatmap
        loss_offsets2d = self.loss_offsets2d(
            preds['offsets2d'], target_labels['offsets2d_target'])

        # directly regressed depth loss with direct depth uncertainty loss
        direct_depth_weights = torch.exp(-preds['direct_depth_uncertainty'])
        loss_weight_1 = self.loss_direct_depth.loss_weight
        loss_direct_depth = self.loss_direct_depth(
            preds['direct_depth'], target_labels['depth_target'],
            direct_depth_weights)
        loss_uncertainty_1 = \
            preds['direct_depth_uncertainty'] * loss_weight_1
        loss_direct_depth = loss_direct_depth + loss_uncertainty_1.mean()

        # keypoints decoded depth loss with keypoints depth uncertainty loss
        depth_mask = target_labels['keypoints_depth_mask']
        depth_target = target_labels['depth_target'].unsqueeze(-1).repeat(
            1, 3)
        valid_keypoints_depth_uncertainty = preds[
            'keypoints_depth_uncertainty'][depth_mask]
        valid_keypoints_depth_weights = torch.exp(
            -valid_keypoints_depth_uncertainty)
        loss_keypoints_depth = self.loss_keypoint_depth(
            preds['keypoints_depth'][depth_mask], depth_target[depth_mask],
            valid_keypoints_depth_weights)
        loss_weight_2 = self.loss_keypoints_depth.loss_weight
        loss_uncertainty_2 = \
            valid_keypoints_depth_uncertainty * loss_weight_2
        loss_keypoints_depth = loss_keypoints_depth + loss_uncertainty_2.mean()

        # combined depth loss for optimizing the uncertainty
        loss_combined_depth = self.loss_combined_depth(
            preds['combined_depth'], target_labels['depth_target'])

        loss_dict = dict(
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            loss_keypoints=loss_keypoints,
            loss_dir=loss_dir,
            loss_dims=loss_dims,
            loss_offsets2d=loss_offsets2d,
            loss_direct_depth=loss_direct_depth,
            loss_keypoints_depth=loss_keypoints_depth,
            loss_combined_depth=loss_combined_depth)

        return loss_dict
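The three depth terms in `loss` share one pattern from the MonoFlex paper: the depth error is down-weighted by exp(-u) for log-uncertainty u, and u itself is added back scaled by the loss weight, so the network cannot shrink the loss by inflating u everywhere. A self-contained sketch of that pattern (plain L1; the helper name and numbers are made up for illustration):

import torch

def uncertainty_weighted_l1(pred_depth, target_depth, log_uncertainty,
                            loss_weight=0.1):
    # exp(-u) down-weights samples the network is unsure about ...
    weights = torch.exp(-log_uncertainty)
    l1 = (pred_depth - target_depth).abs() * weights
    # ... while the +u term penalizes claiming high uncertainty everywhere
    return loss_weight * (l1.mean() + log_uncertainty.mean())

pred = torch.tensor([10.0, 30.0])
target = torch.tensor([12.0, 29.0])
log_u = torch.tensor([0.5, -0.5])
print(uncertainty_weighted_l1(pred, target, log_u))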
mmdet3d/models/model_utils/__init__.py (view @ 8538177b)

 # Copyright (c) OpenMMLab. All rights reserved.
+from .edge_fusion_module import EdgeFusionModule
 from .transformer import GroupFree3DMHA
 from .vote_module import VoteModule

-__all__ = ['VoteModule', 'GroupFree3DMHA']
+__all__ = ['VoteModule', 'GroupFree3DMHA', 'EdgeFusionModule']
mmdet3d/models/model_utils/edge_fusion_module.py (new file @ 8538177b, mode 100644)

from mmcv.cnn import ConvModule
from mmcv.runner import BaseModule
from torch import nn as nn
from torch.nn import functional as F


class EdgeFusionModule(BaseModule):
    """Edge Fusion Module for feature map.

    Args:
        out_channels (int): The number of output channels.
        feat_channels (int): The number of channels in feature map
            during edge feature fusion.
        kernel_size (int, optional): Kernel size of convolution.
            Default: 3.
        act_cfg (dict, optional): Config of activation.
            Default: dict(type='ReLU').
        norm_cfg (dict, optional): Config of normalization.
            Default: dict(type='BN1d').
    """

    def __init__(self,
                 out_channels,
                 feat_channels,
                 kernel_size=3,
                 act_cfg=dict(type='ReLU'),
                 norm_cfg=dict(type='BN1d')):
        super().__init__()
        self.edge_convs = nn.Sequential(
            ConvModule(
                feat_channels,
                feat_channels,
                kernel_size=kernel_size,
                padding=kernel_size // 2,
                conv_cfg=dict(type='Conv1d'),
                norm_cfg=norm_cfg,
                act_cfg=act_cfg),
            nn.Conv1d(feat_channels, out_channels, kernel_size=1))
        self.feat_channels = feat_channels

    def forward(self, features, fused_features, edge_indices, edge_lens,
                output_h, output_w):
        """Forward pass.

        Args:
            features (torch.Tensor): Different representative features
                for fusion.
            fused_features (torch.Tensor): Different representative
                features to be fused.
            edge_indices (torch.Tensor): Batch image edge indices.
            edge_lens (list[int]): List of edge length of each image.
            output_h (int): Height of output feature map.
            output_w (int): Width of output feature map.

        Returns:
            torch.Tensor: Fused feature maps.
        """
        batch_size = features.shape[0]
        # normalize
        grid_edge_indices = edge_indices.view(batch_size, -1, 1, 2).float()
        grid_edge_indices[..., 0] = \
            grid_edge_indices[..., 0] / (output_w - 1) * 2 - 1
        grid_edge_indices[..., 1] = \
            grid_edge_indices[..., 1] / (output_h - 1) * 2 - 1

        # apply edge fusion
        edge_features = F.grid_sample(
            features, grid_edge_indices, align_corners=True).squeeze(-1)
        edge_output = self.edge_convs(edge_features)

        for k in range(batch_size):
            edge_indice_k = edge_indices[k, :edge_lens[k]]
            fused_features[k, :, edge_indice_k[:, 1],
                           edge_indice_k[:, 0]] += \
                edge_output[k, :, :edge_lens[k]]

        return fused_features
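`EdgeFusionModule` samples the feature map at boundary pixel locations with `grid_sample`, pushes those 1D edge features through a small Conv1d stack, and adds the result back onto the head output at the same pixels. A shape-level usage sketch (all sizes made up):

import torch
from mmdet3d.models.model_utils import EdgeFusionModule

# 2 images, 64-channel 32 x 32 feature map, class head with 3 outputs
feat = torch.rand(2, 64, 32, 32)
out_cls = torch.rand(2, 3, 32, 32)
edge_module = EdgeFusionModule(out_channels=3, feat_channels=64)

# fake edge indices: 10 boundary pixels per image as (x, y) integer coords
edge_indices = torch.randint(0, 32, (2, 10, 2))
edge_lens = [10, 10]
fused = edge_module(feat, out_cls, edge_indices, edge_lens, 32, 32)
assert fused.shape == out_cls.shape  # fusion only edits the edge pixels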
mmdet3d/models/utils/edge_indices.py (view @ 8538177b)

@@ -4,6 +4,7 @@ import torch


 def get_edge_indices(img_metas,
+                     downsample_ratio,
                      step=1,
                      pad_mode='default',
                      dtype=np.float32,

@@ -17,6 +18,7 @@ def get_edge_indices(img_metas,
     Args:
         img_metas (list[dict]): Meta information of each image, e.g.,
             image size, scaling factor, etc.
+        downsample_ratio (int): Downsample ratio of output feature map.
         step (int, optional): Step size used for generating
             edge indices. Default: 1.
         pad_mode (str, optional): Padding mode during data pipeline.

@@ -32,13 +34,21 @@ def get_edge_indices(img_metas,
     edge_indices_list = []
     for i in range(len(img_metas)):
         img_shape = img_metas[i]['img_shape']
+        pad_shape = img_metas[i]['pad_shape']
         h, w = img_shape[:2]
+        pad_h, pad_w = pad_shape
         edge_indices = []

         if pad_mode == 'default':
             x_min = 0
             y_min = 0
-            x_max, y_max = w - 1, h - 1
+            x_max = (w - 1) // downsample_ratio
+            y_max = (h - 1) // downsample_ratio
+        elif pad_mode == 'center':
+            x_min = np.ceil((pad_w - w) / 2 * downsample_ratio)
+            y_min = np.ceil((pad_h - h) / 2 * downsample_ratio)
+            x_max = x_min + w // downsample_ratio
+            y_max = y_min + h // downsample_ratio
         else:
             raise NotImplementedError
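With the new `downsample_ratio` argument, the edge indices are generated in feature-map coordinates, so for the 'default' pad mode their count is just the perimeter of the rectangle spanned by (0, 0)..(x_max, y_max). A back-of-envelope check that reproduces the expectations in the updated tests/test_utils/test_utils.py below (assuming step=1 and a boundary walk that visits each point once):

def expected_edge_count(h, w, downsample_ratio=4):
    # 'default' branch: corners of the feature-map rectangle
    x_max = (w - 1) // downsample_ratio
    y_max = (h - 1) // downsample_ratio
    # a closed loop over the boundary visits 2 * (x_max + y_max) points
    return 2 * (x_max + y_max)

print(expected_edge_count(110, 110))  # 108 -> edge_indices_list[0].shape[0]
print(expected_edge_count(98, 110))   # 102 -> edge_indices_list[1].shape[0]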
tests/test_models/test_heads/test_heads.py (view @ 8538177b)

@@ -1505,3 +1505,62 @@ def test_pgd_head():
     assert results[0][2].shape == torch.Size([20])
     assert results[0][3] is None
     assert results[0][4].shape == torch.Size([20, 5])
+
+
+def test_monoflex_head():
+    head_cfg = dict(
+        type='MonoFlexHead',
+        num_classes=3,
+        in_channels=64,
+        use_edge_fusion=True,
+        edge_fusion_inds=[(1, 0)],
+        edge_heatmap_ratio=1 / 8,
+        stacked_convs=0,
+        feat_channels=64,
+        use_direction_classifier=False,
+        diff_rad_by_sin=False,
+        pred_attrs=False,
+        pred_velo=False,
+        dir_offset=0,
+        strides=None,
+        group_reg_dims=((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ),
+                        (1, )),
+        cls_branch=(256, ),
+        reg_branch=((256, ), (256, ), (256, ), (256, ), (256, ), (256, ),
+                    (256, ), (256, )),
+        num_attrs=0,
+        bbox_code_size=7,
+        dir_branch=(),
+        attr_branch=(),
+        bbox_coder=dict(
+            type='MonoFlexCoder',
+            depth_mode='exp',
+            base_depth=(26.494627, 16.05988),
+            depth_range=[0.1, 100],
+            combine_depth=True,
+            uncertainty_range=[-10, 10],
+            base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367, 0.1022),
+                       (0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427),
+                       (1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)),
+            dims_mode='linear',
+            multibin=True,
+            num_dir_bins=4,
+            bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2],
+            bin_margin=np.pi / 6,
+            code_size=7),
+        conv_bias=True,
+        dcn_on_last_conv=False)
+    self = build_head(head_cfg)
+
+    feats = [torch.rand([2, 64, 32, 32], dtype=torch.float32)]
+
+    input_metas = [
+        dict(img_shape=(110, 110), pad_shape=(128, 128)),
+        dict(img_shape=(98, 110), pad_shape=(128, 128))
+    ]
+    cls_score, out_reg = self(feats, input_metas)
+    assert cls_score[0].shape == torch.Size([2, 3, 32, 32])
+    assert out_reg[0].shape == torch.Size([2, 50, 32, 32])
tests/test_utils/test_bbox_coders.py (view @ 8538177b)

 # Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
 import torch
 from mmcv.cnn import Scale
 from torch import nn as nn

@@ -596,3 +597,69 @@ def test_smoke_bbox_coder():
     locations = torch.tensor([[15., 2., 1.], [15., 2., -1.]])
     orientations = bbox_coder._decode_orientation(ori_vector, locations)
     assert orientations.shape == torch.Size([2, 1])
+
+
+def test_monoflex_bbox_coder():
+    bbox_coder_cfg = dict(
+        type='MonoFlexCoder',
+        depth_mode='exp',
+        base_depth=(26.494627, 16.05988),
+        depth_range=[0.1, 100],
+        combine_depth=True,
+        uncertainty_range=[-10, 10],
+        base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367, 0.1022),
+                   (0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427),
+                   (1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)),
+        dims_mode='linear',
+        multibin=True,
+        num_dir_bins=4,
+        bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2],
+        bin_margin=np.pi / 6,
+        code_size=7)
+    bbox_coder = build_bbox_coder(bbox_coder_cfg)
+    gt_bboxes_3d = CameraInstance3DBoxes(torch.rand([6, 7]))
+    orientation_target = bbox_coder.encode(gt_bboxes_3d)
+    assert orientation_target.shape == torch.Size([6, 8])
+
+    regression = torch.rand([100, 50])
+    base_centers2d = torch.rand([100, 2])
+    labels = torch.ones([100])
+    downsample_ratio = 4
+    cam2imgs = torch.rand([100, 4, 4])
+
+    preds = bbox_coder.decode(regression, base_centers2d, labels,
+                              downsample_ratio, cam2imgs)
+
+    assert preds['bboxes2d'].shape == torch.Size([100, 4])
+    assert preds['dimensions'].shape == torch.Size([100, 3])
+    assert preds['offsets2d'].shape == torch.Size([100, 2])
+    assert preds['keypoints2d'].shape == torch.Size([100, 10, 2])
+    assert preds['orientations'].shape == torch.Size([100, 16])
+    assert preds['direct_depth'].shape == torch.Size([100, ])
+    assert preds['keypoints_depth'].shape == torch.Size([100, 3])
+    assert preds['combined_depth'].shape == torch.Size([100, ])
+    assert preds['direct_depth_uncertainty'].shape == torch.Size([100, ])
+    assert preds['keypoints_depth_uncertainty'].shape == torch.Size(
+        [100, 3])
+
+    offsets_2d = torch.randn([100, 2])
+    depths = torch.randn([100, ])
+    locations = bbox_coder.decode_location(base_centers2d, offsets_2d,
+                                           depths, cam2imgs,
+                                           downsample_ratio)
+    assert locations.shape == torch.Size([100, 3])
+
+    orientations = torch.randn([100, 16])
+    yaws, local_yaws = bbox_coder.decode_orientation(orientations, locations)
+    assert yaws.shape == torch.Size([100, ])
+    assert local_yaws.shape == torch.Size([100, ])
tests/test_utils/test_utils.py (view @ 8538177b)

@@ -195,11 +195,15 @@ def test_points_img2cam():

 def test_generate_edge_indices():
-    img_metas = [dict(img_shape=[300, 400]), dict(img_shape=[500, 450])]
-    edge_indices_list = get_edge_indices(img_metas)
+    input_metas = [
+        dict(img_shape=(110, 110), pad_shape=(128, 128)),
+        dict(img_shape=(98, 110), pad_shape=(128, 128))
+    ]
+    downsample_ratio = 4
+    edge_indices_list = get_edge_indices(input_metas, downsample_ratio)

-    assert edge_indices_list[0].shape[0] == 1396
-    assert edge_indices_list[1].shape[0] == 1896
+    assert edge_indices_list[0].shape[0] == 108
+    assert edge_indices_list[1].shape[0] == 102


 def test_truncation_hanlde():