OpenDAS / dcnv3 · Commits
"...git@developer.sourcefind.cn:kecinstone/2024-pra-vllm.git" did not exist on "621980bdc0d5a41e224febf962a6e0474e2b14ef"
Commit f3b13cad, authored May 17, 2023 by yeshenglong1
UpDate README.md
Parent: 0797920d
Showing 20 changed files with 3099 additions and 3099 deletions (+3099, -3099):
autonomous_driving/Online-HD-Map-Construction/src/datasets/pipelines/loading.py (+60, -60)
autonomous_driving/Online-HD-Map-Construction/src/datasets/pipelines/poly_bbox.py (+228, -228)
autonomous_driving/Online-HD-Map-Construction/src/datasets/pipelines/transform.py (+178, -178)
autonomous_driving/Online-HD-Map-Construction/src/datasets/pipelines/vectorize.py (+152, -152)
autonomous_driving/Online-HD-Map-Construction/src/models/__init__.py (+6, -6)
autonomous_driving/Online-HD-Map-Construction/src/models/assigner/__init__.py (+2, -2)
autonomous_driving/Online-HD-Map-Construction/src/models/assigner/assigner.py (+125, -125)
autonomous_driving/Online-HD-Map-Construction/src/models/assigner/match_cost.py (+367, -367)
autonomous_driving/Online-HD-Map-Construction/src/models/augmentation/__init__.py (+0, -0)
autonomous_driving/Online-HD-Map-Construction/src/models/augmentation/sythesis_det.py (+187, -187)
autonomous_driving/Online-HD-Map-Construction/src/models/backbones/__init__.py (+5, -5)
autonomous_driving/Online-HD-Map-Construction/src/models/backbones/internimage.py (+0, -0)
autonomous_driving/Online-HD-Map-Construction/src/models/backbones/ipm_backbone.py (+381, -381)
autonomous_driving/Online-HD-Map-Construction/src/models/heads/__init__.py (+3, -3)
autonomous_driving/Online-HD-Map-Construction/src/models/heads/base_map_head.py (+52, -52)
autonomous_driving/Online-HD-Map-Construction/src/models/heads/detgen_utils/__init__.py (+0, -0)
autonomous_driving/Online-HD-Map-Construction/src/models/heads/detgen_utils/causal_trans.py (+284, -284)
autonomous_driving/Online-HD-Map-Construction/src/models/heads/detgen_utils/utils.py (+85, -85)
autonomous_driving/Online-HD-Map-Construction/src/models/heads/detr_bbox.py (+483, -483)
autonomous_driving/Online-HD-Map-Construction/src/models/heads/detr_head.py (+501, -501)
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/datasets/pipelines/loading.py → autonomous_driving/Online-HD-Map-Construction/src/datasets/pipelines/loading.py (view file @ f3b13cad)
import mmcv
import numpy as np
from mmdet.datasets.builder import PIPELINES


@PIPELINES.register_module(force=True)
class LoadMultiViewImagesFromFiles(object):
    """Load multi channel images from a list of separate channel files.

    Expects results['img_filename'] to be a list of filenames.

    Args:
        to_float32 (bool): Whether to convert the img to float32.
            Defaults to False.
        color_type (str): Color type of the file. Defaults to 'unchanged'.
    """

    def __init__(self, to_float32=False, color_type='unchanged'):
        self.to_float32 = to_float32
        self.color_type = color_type

    def __call__(self, results):
        """Call function to load multi-view images from files.

        Args:
            results (dict): Result dict containing multi-view image filenames.

        Returns:
            dict: The result dict containing the multi-view image data.
                Added keys and values are described below.

                - filename (str): Multi-view image filenames.
                - img (np.ndarray): Multi-view image arrays.
                - img_shape (tuple[int]): Shape of multi-view image arrays.
                - ori_shape (tuple[int]): Shape of original image arrays.
                - pad_shape (tuple[int]): Shape of padded image arrays.
                - scale_factor (float): Scale factor.
                - img_norm_cfg (dict): Normalization configuration of images.
        """
        filename = results['img_filenames']
        img = [mmcv.imread(name, self.color_type) for name in filename]
        if self.to_float32:
            img = [i.astype(np.float32) for i in img]
        results['img'] = img
        results['img_shape'] = [i.shape for i in img]
        results['ori_shape'] = [i.shape for i in img]
        # Set initial values for default meta_keys
        results['pad_shape'] = [i.shape for i in img]
        # results['scale_factor'] = 1.0
        num_channels = 1 if len(img[0].shape) < 3 else img[0].shape[2]
        results['img_norm_cfg'] = dict(
            mean=np.zeros(num_channels, dtype=np.float32),
            std=np.ones(num_channels, dtype=np.float32),
            to_rgb=False)
        results['img_fields'] = ['img']
        return results

    def __repr__(self):
        """str: Return a string that describes the module."""
        return f'{self.__class__.__name__}(to_float32={self.to_float32}, ' \
               f"color_type='{self.color_type}')"
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/datasets/pipelines/poly_bbox.py → autonomous_driving/Online-HD-Map-Construction/src/datasets/pipelines/poly_bbox.py (view file @ f3b13cad)
import numpy as np
from mmdet.datasets.builder import PIPELINES
from shapely.geometry import LineString


@PIPELINES.register_module(force=True)
class PolygonizeLocalMapBbox(object):
    """Pre-processing used by the VectorMapNet model.

    Args:
        canvas_size (tuple or list): bev feature size
        coord_dim (int): dimension of point's coordinate
        num_class (int): number of classes
        threshold (float): threshold for minimum bounding box size
    """

    def __init__(self,
                 canvas_size=(200, 100),
                 coord_dim=2,
                 num_class=3,
                 threshold=6 / 200):
        self.canvas_size = np.array(canvas_size)
        self.num_class = num_class
        # for keypoints
        self.threshold = threshold
        self.coord_dim = coord_dim
        self.map_stop_idx = 0
        self.coord_dim_start_idx = 1

    def format_polyline_map(self, vectors):
        polylines, polyline_masks, polyline_weights = [], [], []

        # quantize each label's lines individually.
        for label, _lines in vectors.items():
            for polyline in _lines:
                # weight and pad polyline.
                if label == 2:
                    polyline_weight = evaluate_line(polyline).reshape(-1)
                else:
                    polyline_weight = np.ones_like(polyline).reshape(-1)
                    polyline_weight = np.pad(
                        polyline_weight, ((0, 1),), constant_values=1.)
                    polyline_weight = polyline_weight / polyline_weight.sum()

                # flatten and quantize
                fpolyline = quantize_verts(
                    polyline, self.canvas_size, self.coord_dim)
                fpolyline = fpolyline.reshape(-1)

                # reindex starting from 1, and add a zero stopping token (EOS)
                fpolyline = np.pad(
                    fpolyline + self.coord_dim_start_idx, ((0, 1),),
                    constant_values=0)
                fpolyline_msk = np.ones(fpolyline.shape, dtype=bool)  # np.bool was removed in NumPy >= 1.24
                polyline_masks.append(fpolyline_msk)
                polyline_weights.append(polyline_weight)
                polylines.append(fpolyline)

        polyline_map = polylines
        polyline_map_mask = polyline_masks
        polyline_map_weights = polyline_weights

        return polyline_map, polyline_map_mask, polyline_map_weights

    def format_keypoint(self, vectors):
        kps, kp_labels = [], []
        qkps, qkp_masks = [], []

        # quantize each label's lines individually.
        for label, _lines in vectors.items():
            for polyline in _lines:
                kp = get_bbox(polyline, self.threshold)
                kps.append(kp)
                kp_labels.append(label)
                gkp = kp

                # flatten and quantize
                fkp = quantize_verts(gkp, self.canvas_size, self.coord_dim)
                fkp = fkp.reshape(-1)
                fkps_msk = np.ones(fkp.shape, dtype=bool)  # np.bool was removed in NumPy >= 1.24
                qkp_masks.append(fkps_msk)
                qkps.append(fkp)

        qkps = np.stack(qkps)
        qkp_msks = np.stack(qkp_masks)

        # format det
        kps = np.stack(kps, axis=0).astype(np.float32) * self.canvas_size
        kp_labels = np.array(kp_labels)

        # restrict the boundary
        kps[..., 0] = np.clip(kps[..., 0], 0.1, self.canvas_size[0] - 0.1)
        kps[..., 1] = np.clip(kps[..., 1], 0.1, self.canvas_size[1] - 0.1)

        # nbox, boxsize(4)*coord_dim(2)
        kps = kps.reshape(kps.shape[0], -1)

        # unflatten_seq(qkps)
        return kps, kp_labels, qkps, qkp_msks

    def Polygonization(self, input_dict):
        '''Process vertices.'''
        vectors = input_dict['vectors']

        n_lines = 0
        for label, lines in vectors.items():
            n_lines += len(lines)
        if not n_lines:
            input_dict['polys'] = []
            return input_dict

        polyline_map, polyline_map_mask, polyline_map_weight = \
            self.format_polyline_map(vectors)

        keypoint, keypoint_label, qkeypoint, qkeypoint_mask = \
            self.format_keypoint(vectors)

        # gather
        polys = {
            # for det
            'keypoint': keypoint,
            'det_label': keypoint_label,
            # for gen
            'gen_label': keypoint_label,
            'qkeypoint': qkeypoint,
            'qkeypoint_mask': qkeypoint_mask,
            'polylines': polyline_map,  # List[array]
            'polyline_masks': polyline_map_mask,  # List[array]
            'polyline_weights': polyline_map_weight
        }

        # Format outputs
        input_dict['polys'] = polys

        return input_dict

    def __call__(self, input_dict):
        input_dict = self.Polygonization(input_dict)
        return input_dict


def evaluate_line(polyline):
    edge = np.linalg.norm(polyline[1:] - polyline[:-1], axis=-1)
    start_end_weight = edge[(0, -1), ].copy()
    mid_weight = (edge[:-1] + edge[1:]) * .5
    pts_weight = np.concatenate(
        (start_end_weight[:1], mid_weight, start_end_weight[-1:]))

    denominator = pts_weight.sum()
    denominator = 1 if denominator == 0 else denominator
    pts_weight /= denominator

    # add weights for stop index
    pts_weight = np.repeat(pts_weight, 2) / 2
    pts_weight = np.pad(pts_weight, ((0, 1)),
                        constant_values=1 / (len(polyline) * 2))

    return pts_weight


def quantize_verts(verts, canvas_size, coord_dim):
    """Convert vertices from their original range ([0, 1]) to discrete canvas indices.

    Args:
        verts (array): vertices coordinates, shape (seqlen, coords_dim)
        canvas_size (tuple): bev feature size
        coord_dim (int): dimension of point coordinates

    Returns:
        quantized_verts (array): quantized vertices, shape (seqlen, coords_dim)
    """
    min_range = 0
    max_range = 1
    range_quantize = np.array(canvas_size) - 1  # (0-199) = 200

    verts_ratio = (verts[:, :coord_dim] - min_range) / (max_range - min_range)
    verts_quantize = verts_ratio * range_quantize[:coord_dim]

    return verts_quantize.astype('int32')


def get_bbox(polyline, threshold):
    """Get the bounding box of a polyline, enlarged to a minimum size if needed.

    Args:
        polyline (array): point coordinates, shape (seqlen, 2)
        threshold (float): threshold for minimum bbox size

    Returns:
        bbox (array): bounding box in xyxy format, shape (2, 2)
    """
    eps = 1e-4
    polyline = LineString(polyline)
    bbox = polyline.bounds
    minx, miny, maxx, maxy = bbox
    W, H = maxx - minx, maxy - miny

    if W < threshold or H < threshold:
        remain = max((threshold - min(W, H)) / 2, eps)
        bbox = polyline.buffer(remain).envelope.bounds
        minx, miny, maxx, maxy = bbox

    bbox_np = np.array([[minx, miny], [maxx, maxy]])
    bbox_np = np.clip(bbox_np, 0., 1.)

    return bbox_np
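The two helpers above are easy to sanity-check in isolation. A minimal sketch, assuming quantize_verts and get_bbox from this file are in scope:

import numpy as np

# A short polyline with normalized coordinates in [0, 1].
polyline = np.array([[0.10, 0.20],
                     [0.50, 0.20],
                     [0.90, 0.60]])

# Map [0, 1] coordinates onto a 200x100 BEV canvas (indices 0..199 / 0..99).
print(quantize_verts(polyline, canvas_size=(200, 100), coord_dim=2))
# [[ 19  19]
#  [ 99  19]
#  [179  59]]

# The polyline's bbox is already larger than the 6/200 threshold,
# so it is returned unchanged, as [[minx, miny], [maxx, maxy]].
print(get_bbox(polyline, threshold=6 / 200))
# [[0.1 0.2]
#  [0.9 0.6]]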
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/datasets/pipelines/transform.py → autonomous_driving/Online-HD-Map-Construction/src/datasets/pipelines/transform.py (view file @ f3b13cad)
import numpy as np
import mmcv
from mmdet.datasets.builder import PIPELINES


@PIPELINES.register_module(force=True)
class Normalize3D(object):
    """Normalize the image.

    Added key is "img_norm_cfg".

    Args:
        mean (sequence): Mean values of 3 channels.
        std (sequence): Std values of 3 channels.
        to_rgb (bool): Whether to convert the image from BGR to RGB,
            default is true.
    """

    def __init__(self, mean, std, to_rgb=True):
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.to_rgb = to_rgb

    def __call__(self, results):
        """Call function to normalize images.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Normalized results, 'img_norm_cfg' key is added into
                result dict.
        """
        for key in results.get('img_fields', ['img']):
            results[key] = [mmcv.imnormalize(
                img, self.mean, self.std, self.to_rgb)
                for img in results[key]]
        results['img_norm_cfg'] = dict(
            mean=self.mean, std=self.std, to_rgb=self.to_rgb)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
        return repr_str


@PIPELINES.register_module(force=True)
class PadMultiViewImages(object):
    """Pad multi-view images and change intrinsics.

    There are two padding modes: (1) pad to a fixed size and (2) pad to the
    minimum size that is divisible by some number.
    Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor".
    If `change_intrinsics=True`, keys 'cam_intrinsics' and 'ego2img' will be changed.

    Args:
        size (tuple, optional): Fixed padding size, (h, w).
        size_divisor (int, optional): The divisor of padded size.
        pad_val (float, optional): Padding value, 0 by default.
        change_intrinsics (bool): whether to update intrinsics.
    """

    def __init__(self, size=None, size_divisor=None, pad_val=0,
                 change_intrinsics=False):
        self.size = size
        self.size_divisor = size_divisor
        self.pad_val = pad_val
        # only one of size and size_divisor should be valid
        assert size is not None or size_divisor is not None
        assert size is None or size_divisor is None
        self.change_intrinsics = change_intrinsics

    def _pad_img(self, results):
        """Pad images according to ``self.size``."""
        original_shape = [img.shape for img in results['img']]
        for key in results.get('img_fields', ['img']):
            if self.size is not None:
                padded_img = [mmcv.impad(
                    img, shape=self.size, pad_val=self.pad_val)
                    for img in results[key]]
            elif self.size_divisor is not None:
                padded_img = [mmcv.impad_to_multiple(
                    img, self.size_divisor, pad_val=self.pad_val)
                    for img in results[key]]
            results[key] = padded_img

        if self.change_intrinsics:
            post_intrinsics, post_ego2imgs = [], []
            for img, oshape, cam_intrinsic, ego2img in zip(
                    results['img'], original_shape,
                    results['cam_intrinsics'], results['ego2img']):
                scaleW = img.shape[1] / oshape[1]
                scaleH = img.shape[0] / oshape[0]
                rot_resize_matrix = np.array([
                    [scaleW, 0,      0, 0],
                    [0,      scaleH, 0, 0],
                    [0,      0,      1, 0],
                    [0,      0,      0, 1]])
                post_intrinsic = rot_resize_matrix[:3, :3] @ cam_intrinsic
                post_ego2img = rot_resize_matrix @ ego2img
                post_intrinsics.append(post_intrinsic)
                post_ego2imgs.append(post_ego2img)
            results.update({
                'cam_intrinsics': post_intrinsics,
                'ego2img': post_ego2imgs,
            })

        results['img_shape'] = [img.shape for img in padded_img]
        results['img_fixed_size'] = self.size
        results['img_size_divisor'] = self.size_divisor

    def __call__(self, results):
        """Call function to pad images, masks, semantic segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Updated result dict.
        """
        self._pad_img(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(size={self.size}, '
        repr_str += f'size_divisor={self.size_divisor}, '
        repr_str += f'pad_val={self.pad_val}, '
        repr_str += f'change_intrinsics={self.change_intrinsics})'
        return repr_str


@PIPELINES.register_module(force=True)
class ResizeMultiViewImages(object):
    """Resize multi-view images and change intrinsics.

    If `change_intrinsics=True`, keys 'cam_intrinsics' and 'ego2img' will be changed.

    Args:
        size (tuple, optional): resize target size, (h, w).
        change_intrinsics (bool): whether to update intrinsics.
    """

    def __init__(self, size, change_intrinsics=True):
        self.size = size
        self.change_intrinsics = change_intrinsics

    def __call__(self, results: dict):
        new_imgs, post_intrinsics, post_ego2imgs = [], [], []
        for img, cam_intrinsic, ego2img in zip(
                results['img'], results['cam_intrinsics'],
                results['ego2img']):
            tmp, scaleW, scaleH = mmcv.imresize(
                img,
                # NOTE: mmcv.imresize expects (w, h) shape
                (self.size[1], self.size[0]),
                return_scale=True)
            new_imgs.append(tmp)

            rot_resize_matrix = np.array([
                [scaleW, 0,      0, 0],
                [0,      scaleH, 0, 0],
                [0,      0,      1, 0],
                [0,      0,      0, 1]])
            post_intrinsic = rot_resize_matrix[:3, :3] @ cam_intrinsic
            post_ego2img = rot_resize_matrix @ ego2img
            post_intrinsics.append(post_intrinsic)
            post_ego2imgs.append(post_ego2img)

        results['img'] = new_imgs
        results['img_shape'] = [img.shape for img in new_imgs]
        if self.change_intrinsics:
            results.update({
                'cam_intrinsics': post_intrinsics,
                'ego2img': post_ego2imgs,
            })

        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(size={self.size}, '
        repr_str += f'change_intrinsics={self.change_intrinsics})'
        return repr_str
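The intrinsics update both classes perform is plain matrix algebra: left-multiplying by a diagonal scale matrix moves projected pixels to their positions in the resized image. A minimal sketch with hypothetical camera values:

import numpy as np

# Hypothetical 3x3 pinhole intrinsics for a 1600x900 image.
K = np.array([[1000.,    0., 800.],
              [   0., 1000., 450.],
              [   0.,    0.,   1.]])

scaleW, scaleH = 0.5, 0.5           # resizing 1600x900 down to 800x450
S = np.diag([scaleW, scaleH, 1.0])  # top-left 3x3 of rot_resize_matrix
K_post = S @ K                      # same update as in the classes above

pt_cam = np.array([1.0, 0.5, 10.0])  # a point in camera coordinates
u, v, w = K @ pt_cam
u2, v2, w2 = K_post @ pt_cam
print(u / w, v / w)        # 900.0 500.0  (pixel in the original image)
print(u2 / w2, v2 / w2)    # 450.0 250.0  (same point in the resized image)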
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/datasets/pipelines/vectorize.py → autonomous_driving/Online-HD-Map-Construction/src/datasets/pipelines/vectorize.py (view file @ f3b13cad)
import numpy as np
from mmdet.datasets.builder import PIPELINES
from shapely.geometry import LineString
from numpy.typing import NDArray
from typing import List, Tuple, Union, Dict


@PIPELINES.register_module(force=True)
class VectorizeMap(object):
    """Generate vectorized map and put it into the `vectors` key.

    Concretely, shapely geometry objects are converted into sample points (ndarray).
    We use args `sample_num`, `sample_dist`, `simplify` to specify the sampling method.

    Args:
        roi_size (tuple or list): bev range.
        normalize (bool): whether to normalize points to range (0, 1).
        coords_dim (int): dimension of point coordinates.
        simplify (bool): whether to use the simplify function. If true,
            `sample_num` and `sample_dist` will be ignored.
        sample_num (int): number of points to interpolate from a polyline. Set to -1 to ignore.
        sample_dist (float): interpolation distance. Set to -1 to ignore.
    """

    def __init__(self,
                 roi_size: Union[Tuple, List],
                 normalize: bool,
                 coords_dim: int,
                 simplify: bool = False,
                 sample_num: int = -1,
                 sample_dist: float = -1):
        self.coords_dim = coords_dim
        self.sample_num = sample_num
        self.sample_dist = sample_dist
        self.roi_size = np.array(roi_size)
        self.normalize = normalize
        self.simplify = simplify

        self.sample_fn = None
        if sample_dist > 0:
            assert sample_num < 0 and not simplify
            self.sample_fn = self.interp_fixed_dist
        if sample_num > 0:
            assert sample_dist < 0 and not simplify
            self.sample_fn = self.interp_fixed_num

    def interp_fixed_num(self, line: LineString) -> NDArray:
        '''Interpolate a line to a fixed number of points.

        Args:
            line (LineString): line

        Returns:
            points (array): interpolated points, shape (N, 2)
        '''
        distances = np.linspace(0, line.length, self.sample_num)
        sampled_points = np.array([list(line.interpolate(distance).coords)
                                   for distance in distances]).squeeze()
        return sampled_points

    def interp_fixed_dist(self, line: LineString) -> NDArray:
        '''Interpolate a line at a fixed interval.

        Args:
            line (LineString): line

        Returns:
            points (array): interpolated points, shape (N, 2)
        '''
        distances = list(np.arange(self.sample_dist, line.length,
                                   self.sample_dist))
        # make sure to sample at least two points when sample_dist > line.length
        distances = [0, ] + distances + [line.length, ]

        sampled_points = np.array([list(line.interpolate(distance).coords)
                                   for distance in distances]).squeeze()
        return sampled_points

    def get_vectorized_lines(self, map_geoms: Dict) -> Dict:
        '''Vectorize map elements. Iterate over the input dict and apply the
        specified sample function.

        Args:
            map_geoms (Dict): dict of shapely geometries keyed by label

        Returns:
            vectors (Dict): dict of vectorized map elements.
        '''
        vectors = {}
        for label, geom_list in map_geoms.items():
            vectors[label] = []
            for geom in geom_list:
                if geom.geom_type == 'LineString':
                    geom = LineString(
                        np.array(geom.coords)[:, :self.coords_dim])
                    if self.simplify:
                        line = geom.simplify(0.2, preserve_topology=True)
                        line = np.array(line.coords)
                    elif self.sample_fn:
                        line = self.sample_fn(geom)
                    else:
                        line = np.array(geom.coords)  # was `line.coords`: `line` is unbound in this branch
                    if self.normalize:
                        line = self.normalize_line(line)
                    vectors[label].append(line)
                elif geom.geom_type == 'Polygon':
                    # polygon objects will not be vectorized
                    continue
                else:
                    raise ValueError(
                        'map geoms must be either LineString or Polygon!')
        return vectors

    def normalize_line(self, line: NDArray) -> NDArray:
        '''Convert points to range (0, 1).

        Args:
            line (array): line points

        Returns:
            normalized (array): normalized points.
        '''
        origin = -np.array([self.roi_size[0] / 2, self.roi_size[1] / 2])
        line[:, :2] = line[:, :2] - origin

        # transform from range [0, 1] to (0, 1)
        eps = 2
        line[:, :2] = line[:, :2] / (self.roi_size + eps)

        return line

    def __call__(self, input_dict):
        map_geoms = input_dict['map_geoms']
        input_dict['vectors'] = self.get_vectorized_lines(map_geoms)
        return input_dict

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(simplify={self.simplify}, '
        repr_str += f'sample_num={self.sample_num}, '
        repr_str += f'sample_dist={self.sample_dist}, '
        repr_str += f'roi_size={self.roi_size}, '
        repr_str += f'normalize={self.normalize}, '
        repr_str += f'coords_dim={self.coords_dim})'
        return repr_str
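The fixed-number sampling that interp_fixed_num performs reduces to shapely's interpolate along arc length. A self-contained sketch of the same computation on a toy L-shaped line:

import numpy as np
from shapely.geometry import LineString

line = LineString([(0, 0), (4, 0), (4, 3)])  # total arc length 7
sample_num = 5

# Equivalent to interp_fixed_num with sample_num=5.
distances = np.linspace(0, line.length, sample_num)
pts = np.array([list(line.interpolate(d).coords)
                for d in distances]).squeeze()
print(pts)
# [[0.    0.  ]
#  [1.75  0.  ]
#  [3.5   0.  ]
#  [4.    1.25]
#  [4.    3.  ]]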
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/__init__.py → autonomous_driving/Online-HD-Map-Construction/src/models/__init__.py (view file @ f3b13cad)
from .backbones import *
from .heads import *
from .losses import *
from .mapers import *
from .transformer_utils import *
from .assigner import *
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/assigner/__init__.py → autonomous_driving/Online-HD-Map-Construction/src/models/assigner/__init__.py (view file @ f3b13cad)
from .assigner import HungarianLinesAssigner
from .match_cost import (MapQueriesCost, BBoxLogitsCost, DynamicLinesCost,
                         IoUCostC, BBoxCostC, LinesCost,
                         LinesFixNumChamferCost, ClsSigmoidCost)
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/assigner/assigner.py → autonomous_driving/Online-HD-Map-Construction/src/models/assigner/assigner.py (view file @ f3b13cad)
import torch
from mmdet.core.bbox.builder import BBOX_ASSIGNERS
from mmdet.core.bbox.assigners import AssignResult
from mmdet.core.bbox.assigners import BaseAssigner
from mmdet.core.bbox.match_costs import build_match_cost

try:
    from scipy.optimize import linear_sum_assignment
except ImportError:
    linear_sum_assignment = None


@BBOX_ASSIGNERS.register_module()
class HungarianLinesAssigner(BaseAssigner):
    """Computes one-to-one matching between predictions and ground truth.

    This class computes an assignment between the targets and the predictions
    based on the costs. The costs are a weighted sum of the classification
    cost and the regression L1 cost. The targets don't include the no_object
    class, so generally there are more predictions than targets. After the
    one-to-one matching, the unmatched predictions are treated as backgrounds.
    Thus each query prediction will be assigned with `0` or a positive integer
    indicating the ground truth index:

    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    Args:
        cls_weight (int | float, optional): The scale factor for classification
            cost. Default 1.0.
        bbox_weight (int | float, optional): The scale factor for regression
            L1 cost. Default 1.0.
    """

    def __init__(self,
                 cost=dict(
                     type='MapQueriesCost',
                     cls_cost=dict(type='ClassificationCost', weight=1.),
                     reg_cost=dict(type='LinesCost', weight=1.0),
                 ),
                 pc_range=None,
                 **kwargs):
        self.pc_range = pc_range
        self.cost = build_match_cost(cost)

    def assign(self, preds: dict, gts: dict, gt_bboxes_ignore=None, eps=1e-7):
        """Computes one-to-one matching based on the weighted costs.

        This method assigns each query prediction to a ground truth or
        background. The `assigned_gt_inds` with -1 means don't care,
        0 means negative sample, and a positive number is the index (1-based)
        of the assigned gt.
        The assignment is done in the following steps; the order matters.

        1. assign every prediction to -1
        2. compute the weighted costs
        3. do Hungarian matching on CPU based on the costs
        4. assign all to 0 (background) first, then for each matched pair
           between predictions and gts, treat this prediction as foreground
           and assign the corresponding gt index (plus 1) to it.

        Args:
            lines_pred (Tensor): predicted normalized lines:
                [num_query, num_points, 2]
            cls_pred (Tensor): Predicted classification logits, shape
                [num_query, num_class].
                Note: when computing bbox L1 loss, velocity is not included!
            lines_gt (Tensor): Ground truth lines
                [num_gt, num_points, 2].
            labels_gt (Tensor): Label of `gt_bboxes`, shape (num_gt,).
            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
                labelled as `ignored`. Default None.
            eps (int | float, optional): A value added to the denominator for
                numerical stability. Default 1e-7.

        Returns:
            :obj:`AssignResult`: The assigned result.
        """
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
        num_gts, num_lines = gts['lines'].size(0), preds['lines'].size(0)

        # 1. assign -1 by default
        assigned_gt_inds = \
            preds['lines'].new_full((num_lines,), -1, dtype=torch.long)
        assigned_labels = \
            preds['lines'].new_full((num_lines,), -1, dtype=torch.long)
        if num_gts == 0 or num_lines == 0:
            # No ground truth or boxes, return empty assignment
            if num_gts == 0:
                # No ground truth, assign all to background
                assigned_gt_inds[:] = 0
            return AssignResult(
                num_gts, assigned_gt_inds, None, labels=assigned_labels)

        # 2. compute the weighted costs
        cost = self.cost(preds, gts)

        # 3. do Hungarian matching on CPU using linear_sum_assignment
        cost = cost.detach().cpu().numpy()
        if linear_sum_assignment is None:
            raise ImportError('Please run "pip install scipy" '
                              'to install scipy first.')
        try:
            matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
        except:
            print('cost max{}, min{}'.format(cost.max(), cost.min()))
            import ipdb; ipdb.set_trace()
        matched_row_inds = torch.from_numpy(
            matched_row_inds).to(preds['lines'].device)
        matched_col_inds = torch.from_numpy(
            matched_col_inds).to(preds['lines'].device)

        # 4. assign backgrounds and foregrounds
        # assign all indices to backgrounds first
        assigned_gt_inds[:] = 0
        # assign foregrounds based on matching results
        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
        assigned_labels[matched_row_inds] = gts['labels'][matched_col_inds]
        return AssignResult(
            num_gts, assigned_gt_inds, None, labels=assigned_labels)
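Steps 3 and 4 of assign() are easy to see on a toy cost matrix; this sketch uses plain NumPy and SciPy only:

import numpy as np
from scipy.optimize import linear_sum_assignment

# 3 predictions x 2 ground truths.
cost = np.array([[0.9, 0.1],
                 [0.4, 0.6],
                 [0.2, 0.8]])

# Step 3: Hungarian matching minimizes the total matched cost.
rows, cols = linear_sum_assignment(cost)
print(rows, cols)  # [0 2] [1 0] -> pred 0 <-> gt 1, pred 2 <-> gt 0

# Step 4: background is 0, matched predictions get 1-based gt indices.
assigned_gt_inds = np.zeros(cost.shape[0], dtype=np.int64)
assigned_gt_inds[rows] = cols + 1
print(assigned_gt_inds)  # [2 0 1] -> pred 1 stays background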
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/assigner/match_cost.py → autonomous_driving/Online-HD-Map-Construction/src/models/assigner/match_cost.py (view file @ f3b13cad)
import
torch
from
mmdet.core.bbox.match_costs.builder
import
MATCH_COST
from
mmdet.core.bbox.match_costs
import
build_match_cost
from
mmdet.core.bbox.iou_calculators
import
bbox_overlaps
from
mmdet.core.bbox.transforms
import
bbox_cxcywh_to_xyxy
def
chamfer_distance
(
pred
,
gt
):
'''
Args:
pred: [num_points, 2]
gt: [num_gt, 2]
Out: torch.FloatTensor of shape (1, )
'''
# [num_points, num_gt]
dist_mat
=
torch
.
cdist
(
pred
,
gt
,
p
=
2
)
# [num_points]
dist_pred
,
_
=
torch
.
min
(
dist_mat
,
dim
=-
1
)
dist_pred
=
torch
.
clamp
(
dist_pred
,
max
=
2.0
)
dist_pred
=
dist_pred
.
mean
()
dist_gt
,
_
=
torch
.
min
(
dist_mat
,
dim
=
0
)
dist_gt
=
torch
.
clamp
(
dist_gt
,
max
=
2.0
)
dist_gt
=
dist_gt
.
mean
()
dist
=
dist_pred
+
dist_gt
return
dist
@
MATCH_COST
.
register_module
()
class
ClsSigmoidCost
:
"""ClsSoftmaxCost.
Args:
weight (int | float, optional): loss_weight
"""
def
__init__
(
self
,
weight
=
1.
):
self
.
weight
=
weight
def
__call__
(
self
,
cls_pred
,
gt_labels
):
"""
Args:
cls_pred (Tensor): Predicted classification logits, shape
[num_query, num_class].
gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
Returns:
torch.Tensor: cls_cost value with weight
"""
# Following the official DETR repo, contrary to the loss that
# NLL is used, we approximate it in 1 - cls_score[gt_label].
# The 1 is a constant that doesn't change the matching,
# so it can be omitted.
cls_score
=
cls_pred
.
sigmoid
()
cls_cost
=
-
cls_score
[:,
gt_labels
]
return
cls_cost
*
self
.
weight
@
MATCH_COST
.
register_module
()
class
LinesFixNumChamferCost
(
object
):
"""BBox3DL1Cost.
Args:
weight (int | float, optional): loss_weight
"""
def
__init__
(
self
,
weight
=
1.
):
self
.
weight
=
weight
def
__call__
(
self
,
lines_pred
,
gt_lines
):
"""
Args:
lines_pred (Tensor): predicted normalized lines:
[num_query, num_points, 2]
gt_lines (Tensor): Ground truth lines
[num_gt, num_points, 2]
Returns:
torch.Tensor: reg_cost value with weight
shape [num_pred, num_gt]
"""
num_gts
,
num_bboxes
=
gt_lines
.
size
(
0
),
lines_pred
.
size
(
0
)
dist_mat
=
lines_pred
.
new_full
((
num_bboxes
,
num_gts
),
1.0
,)
for
i
in
range
(
num_bboxes
):
for
j
in
range
(
num_gts
):
dist_mat
[
i
,
j
]
=
chamfer_distance
(
lines_pred
[
i
],
gt_lines
[
j
])
return
dist_mat
*
self
.
weight
@
MATCH_COST
.
register_module
()
class
LinesCost
(
object
):
"""LinesL1Cost.
Args:
weight (int | float, optional): loss_weight
"""
def
__init__
(
self
,
weight
=
1.
):
self
.
weight
=
weight
def
__call__
(
self
,
lines_pred
,
gt_lines
,
**
kwargs
):
"""
Args:
lines_pred (Tensor): predicted normalized lines:
[num_query, num_points, 2]
gt_lines (Tensor): Ground truth lines
[num_gt, num_points, 2]
Returns:
torch.Tensor: reg_cost value with weight
shape [num_pred, num_gt]
"""
gt_revser
=
torch
.
flip
(
gt_lines
,
dims
=
[
-
2
])
gt_revser_flat
=
gt_revser
.
flatten
(
1
,
2
)
pred_flat
=
lines_pred
.
flatten
(
1
,
2
)
gt_flat
=
gt_lines
.
flatten
(
1
,
2
)
div_
=
pred_flat
.
size
(
-
1
)
dist_mat
=
torch
.
cdist
(
pred_flat
,
gt_flat
,
p
=
1
)
/
div_
return
dist_mat
*
self
.
weight
@
MATCH_COST
.
register_module
()
class
BBoxCostC
:
"""BBoxL1Cost.
Args:
weight (int | float, optional): loss_weight
box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN
Examples:
>>> from mmdet.core.bbox.match_costs.match_cost import BBoxL1Cost
>>> import torch
>>> self = BBoxL1Cost()
>>> bbox_pred = torch.rand(1, 4)
>>> gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
>>> factor = torch.tensor([10, 8, 10, 8])
>>> self(bbox_pred, gt_bboxes, factor)
tensor([[1.6172, 1.6422]])
"""
def
__init__
(
self
,
weight
=
1.
,
box_format
=
'xyxy'
):
self
.
weight
=
weight
assert
box_format
in
[
'xyxy'
,
'xywh'
]
self
.
box_format
=
box_format
def
__call__
(
self
,
bbox_pred
,
gt_bboxes
):
"""
Args:
bbox_pred (Tensor): Predicted boxes with normalized coordinates
(cx, cy, w, h), which are all in range [0, 1]. Shape
[num_query, 4].
gt_bboxes (Tensor): Ground truth boxes with normalized
coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
Returns:
torch.Tensor: bbox_cost value with weight
"""
# if self.box_format == 'xywh':
# gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes)
# elif self.box_format == 'xyxy':
# bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred)
bbox_cost
=
torch
.
cdist
(
bbox_pred
,
gt_bboxes
,
p
=
1
)
return
bbox_cost
*
self
.
weight
@MATCH_COST.register_module()
class IoUCostC:
    """IoUCost.
    Args:
        iou_mode (str, optional): iou mode such as 'iou' | 'giou'
        weight (int | float, optional): loss weight
    Examples:
        >>> from mmdet.core.bbox.match_costs.match_cost import IoUCost
        >>> import torch
        >>> self = IoUCost()
        >>> bboxes = torch.FloatTensor([[1, 1, 2, 2], [2, 2, 3, 4]])
        >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
        >>> self(bboxes, gt_bboxes)
        tensor([[-0.1250, 0.1667],
                [ 0.1667, -0.5000]])
    """

    def __init__(self, iou_mode='giou', weight=1., box_format='xywh'):
        self.weight = weight
        self.iou_mode = iou_mode
        assert box_format in ['xyxy', 'xywh']
        self.box_format = box_format

    def __call__(self, bboxes, gt_bboxes):
        """
        Args:
            bboxes (Tensor): Predicted boxes with unnormalized coordinates
                (x1, y1, x2, y2). Shape [num_query, 4].
            gt_bboxes (Tensor): Ground truth boxes with unnormalized
                coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
        Returns:
            torch.Tensor: iou_cost value with weight
        """
        if self.box_format == 'xywh':
            bboxes = bbox_cxcywh_to_xyxy(bboxes)
            gt_bboxes = bbox_cxcywh_to_xyxy(gt_bboxes)
        # overlaps: [num_bboxes, num_gt]
        overlaps = bbox_overlaps(
            bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False)
        # The 1 is a constant that doesn't change the matching, so omitted.
        iou_cost = -overlaps
        return iou_cost * self.weight
@MATCH_COST.register_module()
class DynamicLinesCost(object):
    """LinesL1Cost.
    Args:
        weight (int | float, optional): loss_weight
    """

    def __init__(self, weight=1.):
        self.weight = weight

    def __call__(self, lines_pred, lines_gt, masks_pred, masks_gt):
        """
        Args:
            lines_pred (Tensor): predicted normalized lines:
                [nP, num_points, 2]
            lines_gt (Tensor): Ground truth lines
                [nG, num_points, 2]
            masks_pred: [nP, num_points]
            masks_gt: [nG, num_points]
        Returns:
            dist_mat: reg_cost value with weight
                shape [nP, nG]
        """
        dist_mat = self.cal_dist(lines_pred, lines_gt)
        dist_mat = self.get_dynamic_line(dist_mat, masks_pred, masks_gt)
        dist_mat = dist_mat * self.weight
        return dist_mat

    def cal_dist(self, x1, x2):
        '''
        Args:
            x1: B1,N,2
            x2: B2,N,2
        Return:
            dist_mat: B1,B2,N
        '''
        x1 = x1.permute(1, 0, 2)
        x2 = x2.permute(1, 0, 2)
        dist_mat = torch.cdist(x1, x2, p=2)
        dist_mat = dist_mat.permute(1, 2, 0)
        return dist_mat

    def get_dynamic_line(self, mat, m1, m2):
        '''
        Get the dynamic line cost: select valid points via the masks
        and average the per-point distances.
        mat: N1xN2xnpts
        m1: N1xnpts
        m2: N2xnpts
        '''
        # nPxnGxnum_points
        m1 = m1.unsqueeze(1).sigmoid() > 0.5
        m2 = m2.unsqueeze(0)
        valid_points_mask = (m1 + m2) / 2.
        average_factor_mask = valid_points_mask.sum(-1) > 0
        average_factor = average_factor_mask.masked_fill(
            ~average_factor_mask, 1)
        # takes the average
        mat = mat * valid_points_mask
        mat = mat.sum(-1) / average_factor
        return mat
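A quick shape check of DynamicLinesCost, with arbitrary sizes (masks_pred carries logits that are sigmoid-thresholded inside get_dynamic_line, while masks_gt is used as-is):

import torch

cost = DynamicLinesCost(weight=1.)
lines_pred = torch.rand(3, 5, 2)   # nP=3 predicted lines, 5 points each
lines_gt = torch.rand(2, 5, 2)     # nG=2 ground-truth lines
masks_pred = torch.randn(3, 5)     # logits for predicted point validity
masks_gt = torch.ones(2, 5)        # all GT points valid
print(cost(lines_pred, lines_gt, masks_pred, masks_gt).shape)  # torch.Size([3, 2])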
@MATCH_COST.register_module()
class BBoxLogitsCost(object):
    """BBoxLogits.
    Args:
        weight (int | float, optional): loss_weight
    """

    def __init__(self, weight=1.):
        self.weight = weight

    def calNLL(self, logits, value):
        '''
        Args:
            logits: B1, 8, cls_dim
            value: B2, 8,
        Return:
            log_likelihood: B1,B2,8
        '''
        logits = logits[:, None]
        value = value[None]
        value = value.long().unsqueeze(-1)
        value, log_pmf = torch.broadcast_tensors(value, logits)
        value = value[..., :1]
        return log_pmf.gather(-1, value).squeeze(-1)

    def __call__(self, bbox_pred, bbox_gt, **kwargs):
        """
        Args:
            bbox_pred: nproposal, 4*2, pos_dim
            bbox_gt: ngt, 4*2
        Returns:
            cost: nproposal, ngt
        """
        cost = self.calNLL(bbox_pred, bbox_gt).mean(-1)
        return cost * self.weight
@MATCH_COST.register_module()
class MapQueriesCost(object):

    def __init__(self, cls_cost, reg_cost, iou_cost=None):
        self.cls_cost = build_match_cost(cls_cost)
        self.reg_cost = build_match_cost(reg_cost)
        self.iou_cost = None
        if iou_cost is not None:
            self.iou_cost = build_match_cost(iou_cost)

    def __call__(self, preds: dict, gts: dict):
        # classification and bbox cost.
        cls_cost = self.cls_cost(preds['scores'], gts['labels'])
        # regression cost
        regkwargs = {}
        if 'masks' in preds and 'masks' in gts:
            assert isinstance(self.reg_cost, DynamicLinesCost), ' Issues!!'
            regkwargs = {
                'masks_pred': preds['masks'],
                'masks_gt': gts['masks'],
            }
        reg_cost = self.reg_cost(preds['lines'], gts['lines'], **regkwargs)
        # weighted sum of above costs
        cost = cls_cost + reg_cost
        # IoU
        if self.iou_cost is not None:
            iou_cost = self.iou_cost(preds['lines'], gts['lines'])
            cost += iou_cost
        return cost
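A hedged usage sketch of the combined matching cost; the cfg dicts assume the cost classes registered earlier in this file (ClsSigmoidCost, LinesCost), and all sizes are arbitrary:

import torch

cost = MapQueriesCost(
    cls_cost=dict(type='ClsSigmoidCost', weight=1.0),
    reg_cost=dict(type='LinesCost', weight=1.0),
)
preds = {'scores': torch.randn(50, 3), 'lines': torch.rand(50, 20, 2)}
gts = {'labels': torch.tensor([0, 2]), 'lines': torch.rand(2, 20, 2)}
print(cost(preds, gts).shape)  # torch.Size([50, 2]): one cost per (query, gt) pair

The resulting matrix is what a Hungarian-style assigner would consume to pair queries with ground-truth map elements.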
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/augmentation/__init__.py → autonomous_driving/Online-HD-Map-Construction/src/models/augmentation/__init__.py (file moved)
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/augmentation/sythesis_det.py → autonomous_driving/Online-HD-Map-Construction/src/models/augmentation/sythesis_det.py
import torch
import torch.nn as nn
import torch.nn.functional as F


class NoiseSythesis(nn.Module):

    def __init__(self,
                 p,
                 scale=0.01,
                 shift_scale=(8, 5),
                 scaling_size=(0.1, 0.1),
                 canvas_size=(200, 100),
                 bbox_type='sce',
                 poly_coord_dim=2,
                 bbox_coord_dim=2,
                 quantify=True):
        super(NoiseSythesis, self).__init__()
        self.p = p
        self.scale = scale
        self.bbox_type = bbox_type
        self.quantify = quantify
        self.poly_coord_dim = poly_coord_dim
        self.bbox_coord_dim = bbox_coord_dim
        self.transforms = [self.random_shifting, self.random_scaling]
        # self.transforms = [self.random_scaling]
        self.register_buffer('canvas_size', torch.tensor(canvas_size))
        self.register_buffer('shift_scale', torch.tensor(shift_scale).float())
        self.register_buffer('scaling_size', torch.tensor(scaling_size))
    def random_scaling(self, bbox):
        '''
        bbox: B, parameter_num, 2
        '''
        device = bbox.device
        dtype = bbox.dtype
        B = bbox.shape[0]
        noise = (torch.rand(B, device=device) * 2 - 1)[:, None, None]  # [-1, 1]
        scale = self.scaling_size.to(device)
        scale = (noise * scale) + 1
        scaled_bbox = bbox * scale
        # recenterization
        coffset = scaled_bbox.mean(-2) - bbox.float().mean(-2)
        scaled_bbox = scaled_bbox - coffset[:, None]
        return scaled_bbox.round().type(dtype)

    def random_shifting(self, bbox):
        '''
        bbox: B, parameter_num, 2
        '''
        device = bbox.device
        batch_size = bbox.shape[0]
        shift_scale = self.shift_scale
        scale = (bbox.max(1)[0] - bbox.min(1)[0]) * 0.1
        scale = torch.where(scale < shift_scale, scale, shift_scale)
        noise = (torch.rand(batch_size, 2, device=device) * 2 - 1)  # [-1, 1]
        offset = (noise * scale).round().type(bbox.dtype)
        shifted_bbox = bbox + offset[:, None]
        return shifted_bbox
    def gaussian_noise_bbox(self, bbox):
        dtype = bbox.dtype
        batch_size = bbox.shape[0]
        scale = (self.canvas_size * self.scale)[:self.bbox_coord_dim]
        noisy_bbox = torch.normal(bbox.type(torch.float), scale)
        if self.quantify:
            noisy_bbox = noisy_bbox.round().type(dtype)
            # prevent out of bound case
            for i in range(self.bbox_coord_dim):
                # the original listing clamped noisy_bbox[..., 0] on every
                # iteration, leaving the other coordinates unclamped;
                # clamping noisy_bbox[..., i] matches gaussian_noise_poly below
                noisy_bbox[..., i] = \
                    torch.clamp(noisy_bbox[..., i], 1, self.canvas_size[i])
        else:
            noisy_bbox = noisy_bbox.type(torch.float)
        return noisy_bbox
    def gaussian_noise_poly(self, polyline, polyline_mask):
        device = polyline.device
        batchsize = polyline.shape[0]
        scale = self.canvas_size * self.scale
        polyline = F.pad(polyline, (0, self.poly_coord_dim - 1))
        polyline = polyline.view(batchsize, -1, self.poly_coord_dim)
        mask = F.pad(polyline_mask[:, 1:], (0, self.poly_coord_dim))
        noisy_polyline = torch.normal(polyline.type(torch.float), scale)
        if self.quantify:
            noisy_polyline = noisy_polyline.round().type(polyline.dtype)
            # prevent out of bound case
            for i in range(self.poly_coord_dim):
                noisy_polyline[..., i] = \
                    torch.clamp(noisy_polyline[..., i], 0, self.canvas_size[i])
        else:
            noisy_polyline = noisy_polyline.type(torch.float)
        noisy_polyline = noisy_polyline.view(batchsize, -1) * mask
        noisy_polyline = noisy_polyline[:, :-(self.poly_coord_dim - 1)]
        return noisy_polyline
    def random_apply(self, bbox):
        for t in self.transforms:
            if self.p < torch.rand(1):
                continue
            bbox = t(bbox)
        # prevent out of bound case
        bbox[..., 0] = \
            torch.clamp(bbox[..., 0], 0, self.canvas_size[0])
        bbox[..., 1] = \
            torch.clamp(bbox[..., 1], 0, self.canvas_size[1])
        return bbox
    def simple_aug(self, batch):
        # augment bbox
        if self.bbox_type in ['sce', 'xyxy']:
            fbbox = batch['bbox_flat']
            seq_len = fbbox.shape[0]
            bbox = fbbox.view(seq_len, -1, 2)
            bbox = self.gaussian_noise_bbox(bbox)
            fbbox_aug = bbox.view(seq_len, -1)
            aug_mask = torch.rand(fbbox.shape, device=fbbox.device)
            fbbox = torch.where(aug_mask < self.p, fbbox_aug, fbbox)
        elif self.bbox_type == 'rxyxy':
            fbbox = self.rbbox_aug(batch)
        elif self.bbox_type == 'convex_hull':
            fbbox = self.convex_hull_aug(batch)
        # augment polyline
        polyline = batch['polylines']
        polyline_mask = batch['polyline_masks']
        polyline_aug = self.gaussian_noise_poly(polyline, polyline_mask)
        aug_mask = torch.rand(polyline.shape, device=polyline.device)
        polyline = torch.where(aug_mask < self.p, polyline_aug, polyline)
        return polyline, fbbox

    def rbbox_aug(self, batch):
        return None

    def convex_hull_aug(self, batch):
        return None

    def __call__(self, batch, simple_aug=False):
        if simple_aug:
            return self.simple_aug(batch)
        else:
            fbbox = batch['bbox_flat']
            seq_len = fbbox.shape[0]
            bbox = fbbox.view(seq_len, -1, self.bbox_coord_dim)
            aug_bbox = self.random_apply(bbox)
            aug_bbox_flat = aug_bbox.view(seq_len, -1)
            return aug_bbox_flat
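A minimal usage sketch of the augmenter; the batch keys follow simple_aug above, and the sizes and integer canvases are arbitrary assumptions:

import torch

aug = NoiseSythesis(p=0.5)  # each transform fires with probability 0.5
batch = {
    'bbox_flat': torch.randint(0, 100, (16, 8)),     # 16 seqs of 4 xy pairs
    'polylines': torch.randint(0, 100, (16, 21)),
    'polyline_masks': torch.ones(16, 21, dtype=torch.bool),
}
aug_bbox_flat = aug(batch)                     # random shift/scale path
polyline, fbbox = aug(batch, simple_aug=True)  # gaussian-noise path

Note that dtype and broadcasting behavior follow the methods above: integer inputs are noised in float and rounded back when quantify=True.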
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/backbones/__init__.py → autonomous_driving/Online-HD-Map-Construction/src/models/backbones/__init__.py
from .ipm_backbone import IPMEncoder

__all__ = ['IPMEncoder']
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/backbones/internimage.py → autonomous_driving/Online-HD-Map-Construction/src/models/backbones/internimage.py (file moved)
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/backbones/ipm_backbone.py → autonomous_driving/Online-HD-Map-Construction/src/models/backbones/ipm_backbone.py
import copy
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet3d.models.builder import BACKBONES
from mmdet.models import build_backbone, build_neck
class UpsampleBlock(nn.Module):

    def __init__(self, ins, outs):
        super(UpsampleBlock, self).__init__()
        self.gn = nn.GroupNorm(32, outs)
        self.conv = nn.Conv2d(
            ins, outs, kernel_size=3, stride=1, padding=1)  # same
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.conv(x)
        x = self.relu(self.gn(x))
        x = self.upsample2x(x)
        return x

    def upsample2x(self, x):
        _, _, h, w = x.shape
        x = F.interpolate(
            x, size=(h * 2, w * 2), mode='bilinear', align_corners=True)
        return x
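A quick sanity check of the block with arbitrary sizes (outs must be divisible by the 32 GroupNorm groups):

blk = UpsampleBlock(64, 128)
x = torch.randn(2, 64, 25, 12)
print(blk(x).shape)  # torch.Size([2, 128, 50, 24]): conv keeps H x W, interpolate doubles it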
class Upsample(nn.Module):

    def __init__(
        self,
        zoom_size=(2, 4, 8),
        in_channels=128,
        out_channels=128,
    ):
        super(Upsample, self).__init__()
        self.out_channels = out_channels
        input_conv = UpsampleBlock(in_channels, out_channels)
        inter_conv = UpsampleBlock(out_channels, out_channels)
        fscale = []
        for scale_factor in zoom_size:
            layer_num = int(math.log2(scale_factor))
            if layer_num < 1:
                fscale.append(nn.Identity())
                continue
            tmp = [copy.deepcopy(input_conv), ]
            tmp += [copy.deepcopy(inter_conv) for i in range(layer_num - 1)]
            fscale.append(nn.Sequential(*tmp))
        self.fscale = nn.ModuleList(fscale)

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_uniform_(m.weight, a=1)
                nn.init.constant_(m.bias, 0)

    def forward(self, imgs):
        rescale_i = []
        for f, img in zip(self.fscale, imgs):
            rescale_i.append(f(img))
        out = sum(rescale_i)
        return out
@BACKBONES.register_module()
class IPMEncoder(nn.Module):
    '''
    encode cam features
    '''

    def __init__(self,
                 img_backbone,
                 img_neck,
                 upsample,
                 xbound=[-30.0, 30.0, 0.5],
                 ybound=[-15.0, 15.0, 0.5],
                 zbound=[-10.0, 10.0, 20.0],
                 heights=[-1.1, 0, 0.5, 1.1],
                 pretrained=None,
                 out_channels=128,
                 num_cam=6,
                 use_lidar=False,
                 use_image=True,
                 lidar_dim=128,
                 ):
        super(IPMEncoder, self).__init__()
        self.x_bound = xbound
        self.y_bound = ybound
        self.heights = heights
        self.num_cam = num_cam
        num_x = int((xbound[1] - xbound[0]) / xbound[2])
        num_y = int((ybound[1] - ybound[0]) / ybound[2])
        self.img_backbone = build_backbone(img_backbone)
        self.img_neck = build_neck(img_neck)
        self.upsample = Upsample(**upsample)
        self.use_image = use_image
        self.use_lidar = use_lidar
        if self.use_lidar:
            # NOTE: PointPillarEncoder is not imported in this file;
            # it must be available when use_lidar=True
            self.pp = PointPillarEncoder(lidar_dim, xbound, ybound, zbound)
            self.outconvs = \
                nn.Conv2d((self.upsample.out_channels + 3) * len(heights),
                          out_channels // 2,
                          kernel_size=3, stride=1, padding=1)  # same
            if self.use_image:
                _out_channels = out_channels // 2
            else:
                _out_channels = out_channels
            self.outconvs_lidar = \
                nn.Conv2d(lidar_dim, _out_channels,
                          kernel_size=3, stride=1, padding=1)  # same
        else:
            self.outconvs = \
                nn.Conv2d((self.upsample.out_channels + 3) * len(heights),
                          out_channels,
                          kernel_size=3, stride=1, padding=1)  # same
        self.init_weights(pretrained=pretrained)
        # bev_plane
        bev_planes = [construct_plane_grid(xbound, ybound, h)
                      for h in self.heights]
        self.register_buffer('bev_planes', torch.stack(bev_planes))  # nlvl, bH, bW, 3
        self.masked_embeds = nn.Embedding(len(heights), out_channels)
    def init_weights(self, pretrained=None):
        """Initialize model weights."""
        self.img_backbone.init_weights()
        self.img_neck.init_weights()
        self.upsample.init_weights()
        for p in self.outconvs.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        if self.use_lidar:
            for p in self.outconvs_lidar.parameters():
                if p.dim() > 1:
                    nn.init.xavier_uniform_(p)
            for p in self.pp.parameters():
                if p.dim() > 1:
                    nn.init.xavier_uniform_(p)
    def extract_img_feat(self, imgs):
        '''
        Extract image features and sum up into one pic
        Args:
            imgs: B, n_cam, C, iH, iW
        Returns:
            img_feat: B * n_cam, C, H, W
        '''
        B, n_cam, C, iH, iW = imgs.shape
        imgs = imgs.view(B * n_cam, C, iH, iW)
        img_feats = self.img_backbone(imgs)
        # reduce the channel dim
        img_feats = self.img_neck(img_feats)
        # fuse the multi-scale feature maps
        img_feat = self.upsample(img_feats)
        return img_feat
    def forward(self, imgs, img_metas, *args, points=None, **kwargs):
        '''
        Args:
            imgs: torch.Tensor of shape [B, N, 3, H, W]
                N: number of cams
            img_metas:
                # N=6, ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT']
                ego2cam: [B, N, 4, 4]
                cam_intrinsics: [B, N, 3, 3]
                cam2ego_rotations: [B, N, 3, 3]
                cam2ego_translations: [B, N, 3]
                ...
        Outs:
            bev_feature: torch.Tensor of shape [B, C*nlvl, bH, bW]
        '''
        if self.use_image:
            self.B = imgs.shape[0]
            # Get transform matrix
            ego2cam = []
            for img_meta in img_metas:
                ego2cam.append(img_meta['ego2img'])
            img_shape = imgs.shape[-2:]
            ego2cam = np.asarray(ego2cam)
            # Image backbone
            img_feats = self.extract_img_feat(imgs)
            # IPM
            bev_feat, bev_feat_mask = self.ipm(img_feats, ego2cam, img_shape)
            # merge the multi-level height planes into one feature map
            bev_feat = bev_feat.flatten(1, 2)
            bev_feat = self.outconvs(bev_feat)
        if self.use_lidar:
            lidar_feat = self.get_lidar_feature(points)
            if self.use_image:
                bev_feat = torch.cat([bev_feat, lidar_feat], dim=1)
            else:
                bev_feat = lidar_feat
        return bev_feat
    def ipm(self, cam_feat, ego2cam, img_shape):
        '''
        inverse project
        Args:
            cam_feat: B*ncam, C, cH, cW
            img_shape: tuple(H, W)
        Returns:
            project_feat: B, C, nlvl, bH, bW
            bev_feat_mask: B, 1, nlvl, bH, bW
        '''
        C = cam_feat.shape[1]
        bev_grid = self.bev_planes.unsqueeze(0).repeat(self.B, 1, 1, 1, 1)
        nlvl, bH, bW = bev_grid.shape[1:4]
        bev_grid = bev_grid.flatten(1, 3)  # B, nlvl*W*H, 3
        # Find points in cam coords
        # bev_grid_pos: B*ncam, nlvl*bH*bW, 2
        bev_grid_pos, bev_cam_mask = get_campos(bev_grid, ego2cam, img_shape)
        # B*cam, nlvl*bH, bW, 2
        bev_grid_pos = bev_grid_pos.unflatten(-2, (nlvl * bH, bW))
        # project feat from 2D to bev plane
        projected_feature = F.grid_sample(
            cam_feat, bev_grid_pos, align_corners=False).view(
                self.B, -1, C, nlvl, bH, bW)  # B,cam,C,nlvl,bH,bW
        # B,cam,nlvl,bH,bW
        bev_feat_mask = bev_cam_mask.unflatten(-1, (nlvl, bH, bW))
        # eliminate the ncam
        # The bev feature is the sum of the 6 cameras
        bev_feat_mask = bev_feat_mask.unsqueeze(2)
        projected_feature = (projected_feature * bev_feat_mask).sum(1)
        num_feat = bev_feat_mask.sum(1)
        projected_feature = projected_feature / \
            num_feat.masked_fill(num_feat == 0, 1)
        # concatenate a position information
        # projected_feature: B, bH, bW, nlvl, C+3
        bev_grid = bev_grid.view(
            self.B, nlvl, bH, bW, 3).permute(0, 4, 1, 2, 3)
        projected_feature = torch.cat(
            (projected_feature, bev_grid), dim=1)
        return projected_feature, bev_feat_mask.sum(1) > 0
    def get_lidar_feature(self, points):
        ptensor, pmask = points
        lidar_feature = self.pp(ptensor, pmask)
        # bev_grid = self.bev_planes[..., :-1].unsqueeze(0).repeat(self.B, 1, 1, 1, 1)
        # bev_grid = bev_grid[:, 0]
        # bev_grid = bev_grid.permute(0, 3, 1, 2)
        # lidar_feature = torch.cat(
        #     (lidar_feature, bev_grid), dim=1)
        lidar_feature = self.outconvs_lidar(lidar_feature)
        return lidar_feature
def construct_plane_grid(xbound, ybound, height: float, dtype=torch.float32):
    '''
    Returns:
        plane: H, W, 3
    '''
    xmin, xmax = xbound[0], xbound[1]
    num_x = int((xbound[1] - xbound[0]) / xbound[2])
    ymin, ymax = ybound[0], ybound[1]
    num_y = int((ybound[1] - ybound[0]) / ybound[2])
    x = torch.linspace(xmin, xmax, num_x, dtype=dtype)
    y = torch.linspace(ymin, ymax, num_y, dtype=dtype)
    # [num_y, num_x]
    y, x = torch.meshgrid(y, x)
    z = torch.ones_like(x) * height
    # [num_y, num_x, 3]
    plane = torch.stack([x, y, z], dim=-1)
    return plane
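With the default bounds used by IPMEncoder above, each BEV plane is a 60 x 120 grid of ego-frame (x, y, z) coordinates at a fixed height; a quick check:

plane = construct_plane_grid([-30.0, 30.0, 0.5], [-15.0, 15.0, 0.5], height=0.0)
print(plane.shape)                 # torch.Size([60, 120, 3])
print(plane[0, 0], plane[-1, -1])  # corners: (-30, -15, 0) and (30, 15, 0)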
def get_campos(reference_points, ego2cam, img_shape):
    '''
    Find each reference point's corresponding pixel in each camera
    Args:
        reference_points: [B, num_query, 3]
        ego2cam: (B, num_cam, 4, 4)
    Outs:
        reference_points_cam: (B*num_cam, num_query, 2)
        mask: (B, num_cam, num_query)
        num_query == W*H
    '''
    ego2cam = reference_points.new_tensor(ego2cam)  # (B, N, 4, 4)
    reference_points = reference_points.clone()
    B, num_query = reference_points.shape[:2]
    num_cam = ego2cam.shape[1]
    # reference_points (B, num_queries, 4)
    reference_points = torch.cat(
        (reference_points, torch.ones_like(reference_points[..., :1])), -1)
    reference_points = reference_points.view(
        B, 1, num_query, 4).repeat(1, num_cam, 1, 1).unsqueeze(-1)
    ego2cam = ego2cam.view(
        B, num_cam, 1, 4, 4).repeat(1, 1, num_query, 1, 1)
    # reference_points_cam (B, num_cam, num_queries, 4)
    reference_points_cam = (ego2cam @ reference_points).squeeze(-1)
    eps = 1e-9
    mask = (reference_points_cam[..., 2:3] > eps)
    reference_points_cam = \
        reference_points_cam[..., 0:2] / \
        reference_points_cam[..., 2:3] + eps
    reference_points_cam[..., 0] /= img_shape[1]
    reference_points_cam[..., 1] /= img_shape[0]
    # from 0~1 to -1~1
    reference_points_cam = (reference_points_cam - 0.5) * 2
    mask = (mask & (reference_points_cam[..., 0:1] > -1.0)
            & (reference_points_cam[..., 0:1] < 1.0)
            & (reference_points_cam[..., 1:2] > -1.0)
            & (reference_points_cam[..., 1:2] < 1.0))
    # (B, num_cam, num_query)
    mask = mask.view(B, num_cam, num_query)
    reference_points_cam = reference_points_cam.view(
        B * num_cam, num_query, 2)
    return reference_points_cam, mask
def _test():
    pass


if __name__ == '__main__':
    _test()
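A minimal shape check of get_campos; the identity matrices stand in for real ego-to-image projections and are purely illustrative:

import numpy as np
import torch

B, num_cam, num_query = 1, 2, 8
ref = torch.rand(B, num_query, 3) * 10           # ego-frame points
ego2cam = np.stack([np.eye(4)] * num_cam)[None]  # placeholder 4x4 projections
uv, vis = get_campos(ref, ego2cam, img_shape=(900, 1600))
print(uv.shape, vis.shape)  # torch.Size([2, 8, 2]) torch.Size([1, 2, 8])

uv holds grid_sample-ready coordinates in [-1, 1], and vis marks points that land in front of the camera and inside the image.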
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/heads/__init__.py → autonomous_driving/Online-HD-Map-Construction/src/models/heads/__init__.py
from .base_map_head import BaseMapHead
from .dg_head import DGHead
from .map_element_detector import MapElementDetector
from .polyline_generator import PolylineGenerator
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/heads/base_map_head.py → autonomous_driving/Online-HD-Map-Construction/src/models/heads/base_map_head.py
from abc import ABCMeta, abstractmethod

import torch.nn as nn
from mmcv.runner import auto_fp16
from mmcv.utils import print_log
from mmdet.utils import get_root_logger


class BaseMapHead(nn.Module, metaclass=ABCMeta):
    """Base class for mappers."""

    def __init__(self):
        super(BaseMapHead, self).__init__()
        self.fp16_enabled = False

    def init_weights(self, pretrained=None):
        """Initialize the weights in detector.
        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """
        if pretrained is not None:
            logger = get_root_logger()
            print_log(f'load model from: {pretrained}', logger=logger)

    @auto_fp16(apply_to=('img', ))
    def forward(self, *args, **kwargs):
        pass

    @abstractmethod
    def loss(self, pred, gt):
        '''
        Compute loss
        Output:
            dict(
                loss: torch.Tensor
                log_vars: dict(
                    str: float,
                )
                num_samples: int
            )
        '''
        return

    @abstractmethod
    def post_process(self, pred):
        '''
        convert model predictions to vectorized outputs
        the output format should be consistent with the evaluation function
        '''
        return
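A subclass only has to fill in the two abstract hooks; a minimal sketch (the head body and L1 loss are placeholders, not part of the repo):

import torch

class DummyMapHead(BaseMapHead):
    def loss(self, pred, gt):
        l = torch.nn.functional.l1_loss(pred, gt)
        return dict(loss=l, log_vars={'l1': l.item()}, num_samples=gt.shape[0])

    def post_process(self, pred):
        # vectorized output handed to the evaluation function
        return pred.detach().cpu().numpy()

head = DummyMapHead()
pred, gt = torch.rand(4, 10), torch.rand(4, 10)
print(head.loss(pred, gt)['log_vars'])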
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/heads/detgen_utils/__init__.py → autonomous_driving/Online-HD-Map-Construction/src/models/heads/detgen_utils/__init__.py (file moved)
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/heads/detgen_utils/causal_trans.py → autonomous_driving/Online-HD-Map-Construction/src/models/heads/detgen_utils/causal_trans.py
# the causal layer is adapted from https://github.com/alexmt-scale/causal-transformer-decoder
# we made some changes to stick with the polygen setup.
import torch
import torch.nn as nn
from typing import Optional

from torch import Tensor
from mmcv.cnn.bricks.registry import ATTENTION
from mmcv.utils import build_from_cfg


def build_attention(cfg, default_args=None):
    """Builder for attention."""
    return build_from_cfg(cfg, ATTENTION, default_args)
class CausalTransformerDecoder(nn.TransformerDecoder):
    """Implementation of a transformer decoder based on the torch
    implementation but more efficient. The difference is that it doesn't need
    to recompute the embeddings of all the past decoded tokens; instead it
    uses a cache to store them. This exploits the fact that the attention of
    a decoder is causal, so newly predicted tokens don't affect the old
    tokens' embeddings, because the corresponding attention cells are masked.
    The complexity goes from seq_len^3 to seq_len^2.
    This only happens in eval mode.
    In training mode, teacher forcing makes these optimizations unnecessary,
    so the decoder acts like a regular nn.TransformerDecoder (except that the
    attention tgt masks are handled for you).
    """

    def forward(
        self,
        tgt: Tensor,
        memory: Optional[Tensor] = None,
        cache: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        causal_mask: Optional[Tensor] = None,
    ) -> Tensor:
        """
        Args:
            tgt (Tensor): current_len_output x bsz x hidden_dim
            memory (Tensor): len_encoded_seq x bsz x hidden_dim
            cache (Optional[Tensor]):
                n_layers x (current_len_output - 1) x bsz x hidden_dim
                If current_len_output == 1, nothing is cached yet, so cache
                should be None. Same if the module is in training mode.
            others (Optional[Tensor]): see official documentation
        Returns:
            output (Tensor): current_len_output x bsz x hidden_dim
            cache (Optional[Tensor]): n_layers x current_len_output x bsz x hidden_dim
                Only returned when the module is in eval mode (no caching in training)
        """
        output = tgt
        if self.training:
            if cache is not None:
                raise ValueError(
                    "cache parameter should be None in training mode")
            for mod in self.layers:
                output = mod(
                    output,
                    memory,
                    memory_mask=memory_mask,
                    tgt_key_padding_mask=tgt_key_padding_mask,
                    memory_key_padding_mask=memory_key_padding_mask,
                    causal_mask=causal_mask,
                    only_last=False,
                )
            return output, cache
        else:
            new_token_cache = []
            for i, mod in enumerate(self.layers):
                output = mod(
                    output,
                    memory,
                    memory_mask=memory_mask,
                    tgt_key_padding_mask=tgt_key_padding_mask,
                    memory_key_padding_mask=memory_key_padding_mask,
                    causal_mask=causal_mask,
                    only_last=True if cache is not None else False)
                new_token_cache.append(output)
                # use the pre-calculated intermediate parameters.
                if cache is not None:
                    output = torch.cat([cache[i], output], dim=0)
            if cache is not None:
                new_cache = torch.cat(
                    [cache, torch.stack(new_token_cache, dim=0)], dim=1)
            else:
                new_cache = torch.stack(new_token_cache, dim=0)
            return output, new_cache
class CausalTransformerDecoderLayer(nn.TransformerDecoderLayer):

    def __init__(self, *args, re_zero=True, norm_first=True,
                 map_attn_cfg=None, **kwargs):
        '''
        Args:
            re_zero: If True, alpha scale residuals with zero init.
        '''
        super(CausalTransformerDecoderLayer, self).__init__(*args, **kwargs)
        if re_zero:
            self.res_weight1 = nn.Parameter(torch.FloatTensor([0, ]))
            self.res_weight2 = nn.Parameter(torch.FloatTensor([0, ]))
            self.res_weight3 = nn.Parameter(torch.FloatTensor([0, ]))
        else:
            self.res_weight1 = 1.
            self.res_weight2 = 1.
            self.res_weight3 = 1.
        self.norm_first = norm_first
        self.map_attn = None
        if map_attn_cfg is not None:
            self.map_attn = build_attention(map_attn_cfg)

    def forward(
        self,
        tgt: Tensor,
        memory: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        causal_mask: Optional[Tensor] = None,
        query: Optional[Tensor] = None,
        only_last=False,
    ) -> Tensor:
        """
        Args:
            see CausalTransformerDecoder
            if query is not None, the model performs the query stream
        Returns:
            Tensor:
                If training: embedding of the whole layer: seq_len x bsz x hidden_dim
                If eval mode: embedding of last token: 1 x bsz x hidden_dim
        """
        if not self.norm_first:
            raise ValueError("norm_first parameter should be True!")
        if self.training:
            # the official Pytorch implementation
            x = tgt
            if query is not None:
                x = query
            x = x + self.res_weight1 * \
                self._sa_block(self.norm1(x), self.norm1(tgt),
                               causal_mask, tgt_key_padding_mask)
            if memory is not None:
                x = x + self.res_weight2 * \
                    self._mha_block(self.norm2(x), memory,
                                    memory_mask, memory_key_padding_mask)
            x = x + self.res_weight3 * self._ff_block(self.norm3(x))
            return x
        # This part is adapted from the official Pytorch implementation
        # so that only the last token gets modified and returned.
        # We follow the pre-LN transformer in https://arxiv.org/pdf/2002.04745v1.pdf .
        x = tgt
        if query is not None:
            x = query
        if only_last:
            x = x[-1:]
        if causal_mask is not None:
            attn_mask = causal_mask
            if only_last:
                attn_mask = attn_mask[-1:]  # XXX
        else:
            attn_mask = None
        # efficient self attention
        x = x + self.res_weight1 * \
            self._sa_block(self.norm1(x), self.norm1(tgt),
                           attn_mask, tgt_key_padding_mask)
        # encoder-decoder attention
        if memory is not None:
            x = x + self.res_weight2 * \
                self._mha_block(self.norm2(x), memory,
                                memory_mask, memory_key_padding_mask)
        # final feed-forward network
        x = x + self.res_weight3 * self._ff_block(self.norm3(x))
        return x
    # self-attention block
    def _sa_block(self, x: Tensor, mem: Tensor,
                  attn_mask: Optional[Tensor],
                  key_padding_mask: Optional[Tensor]) -> Tensor:
        x = self.self_attn(x, mem, mem,
                           attn_mask=attn_mask,
                           key_padding_mask=key_padding_mask,
                           need_weights=False)[0]
        return self.dropout1(x)

    # multihead attention block
    def _mha_block(self, x: Tensor, mem: Tensor,
                   attn_mask: Optional[Tensor],
                   key_padding_mask: Optional[Tensor]) -> Tensor:
        x = self.multihead_attn(x, mem, mem,
                                attn_mask=attn_mask,
                                key_padding_mask=key_padding_mask,
                                need_weights=False)[0]
        return self.dropout2(x)

    # feed forward block
    def _ff_block(self, x: Tensor) -> Tensor:
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout3(x)
class PolygenTransformerEncoderLayer(nn.TransformerEncoderLayer):

    def __init__(self, *args, re_zero=True, norm_first=True, **kwargs):
        '''
        Args:
            re_zero: If True, alpha scale residuals with zero init.
        '''
        super(PolygenTransformerEncoderLayer, self).__init__(*args, **kwargs)
        if re_zero:
            self.res_weight1 = nn.Parameter(torch.FloatTensor([0, ]))
            self.res_weight2 = nn.Parameter(torch.FloatTensor([0, ]))
        else:
            self.res_weight1 = 1.
            self.res_weight2 = 1.
        self.norm_first = norm_first

    def forward(self, src: Tensor,
                src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        r"""Pass the input through the encoder layer.
        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).
        Shape:
            see the docs in Transformer class.
        """
        # see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf
        x = src
        if self.norm_first:
            x = x + self.res_weight1 * self._sa_block(
                self.norm1(x), src_mask, src_key_padding_mask)
            x = x + self.res_weight2 * self._ff_block(self.norm2(x))
        else:
            x = self.norm1(x + self.res_weight1 *
                           self._sa_block(x, src_mask, src_key_padding_mask))
            x = self.norm2(x + self.res_weight2 * self._ff_block(x))
        return x

    # self-attention block
    def _sa_block(self, x: Tensor,
                  attn_mask: Optional[Tensor],
                  key_padding_mask: Optional[Tensor]) -> Tensor:
        x = self.self_attn(x, x, x,
                           attn_mask=attn_mask,
                           key_padding_mask=key_padding_mask,
                           need_weights=False)[0]
        return self.dropout1(x)

    # feed forward block
    def _ff_block(self, x: Tensor) -> Tensor:
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout2(x)
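A small check of the ReZero behavior these layers share: with re_zero=True the residual weights start at zero, so a freshly constructed layer is an exact identity map (the dimensions below are arbitrary):

layer = PolygenTransformerEncoderLayer(d_model=32, nhead=4)  # re_zero=True by default
x = torch.randn(7, 2, 32)  # seq_len x bsz x dim
print(torch.allclose(layer(x), x))  # True at init: zero-initialized residual weights

This lets training start from a stable identity network and gradually let each sub-block in, which is the point of the zero-init residual scaling.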
def generate_square_subsequent_mask(sz: int, device: str = "cpu") -> torch.Tensor:
    """ Generate the attention mask for causal decoding """
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = (
        mask.float()
        .masked_fill(mask == 0, float("-inf"))
        .masked_fill(mask == 1, float(0.0))
    ).to(device=device)
    # the flattened listing dropped the return statement; without it the
    # function would return None, so restore it here
    return mask
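A minimal sketch of cached decoding with the classes above; dimensions are arbitrary, and feeding the last output embedding back in stands in for a real token-embedding step (in training mode the same call instead runs the full teacher-forced sequence in one pass):

import torch

layer = CausalTransformerDecoderLayer(d_model=64, nhead=4)
decoder = CausalTransformerDecoder(layer, num_layers=2)
decoder.eval()  # caching only happens in eval mode

memory = torch.randn(10, 2, 64)  # encoded sequence: len x bsz x dim
tokens = torch.randn(1, 2, 64)   # first decoded token embedding
cache = None
with torch.no_grad():
    for _ in range(5):
        mask = generate_square_subsequent_mask(tokens.size(0))
        out, cache = decoder(tokens, memory, cache=cache, causal_mask=mask)
        tokens = torch.cat([tokens, out[-1:]], dim=0)  # append newest embedding

On every step after the first, each layer only recomputes the last position and concatenates the cached embeddings, which is the seq_len^2 behavior the class docstring describes.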
# the causal decoder is credited to https://github.com/alexmt-scale/causal-transformer-decoder
# we made some changes to stick with the polygen setup.
import torch
import torch.nn as nn
from typing import Optional
from torch import Tensor

from mmcv.cnn.bricks.registry import ATTENTION
from mmcv.utils import build_from_cfg


def build_attention(cfg, default_args=None):
    """Builder for attention."""
    return build_from_cfg(cfg, ATTENTION, default_args)


class CausalTransformerDecoder(nn.TransformerDecoder):
    """Implementation of a transformer decoder based on the torch implementation
    but more efficient: it doesn't need to recompute the embeddings of all the
    past decoded tokens, and instead uses a cache to store them. This exploits
    the fact that decoder attention is causal, so new predicted tokens don't
    affect the old tokens' embeddings, because the corresponding attention
    cells are masked. The complexity goes from seq_len^3 to seq_len^2.
    This only happens in eval mode. In training mode, teacher forcing makes
    these optimizations unnecessary, so the decoder acts like a regular
    nn.TransformerDecoder (except that the attention tgt masks are handled
    for you).
    """

    def forward(
        self,
        tgt: Tensor,
        memory: Optional[Tensor] = None,
        cache: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        causal_mask: Optional[Tensor] = None,
    ) -> Tensor:
        """
        Args:
            tgt (Tensor): current_len_output x bsz x hidden_dim
            memory (Tensor): len_encoded_seq x bsz x hidden_dim
            cache (Optional[Tensor]):
                n_layers x (current_len_output - 1) x bsz x hidden_dim
                If current_len_output == 1, nothing is cached yet, so cache
                should be None. Same if the module is in training mode.
            others (Optional[Tensor]): see official documentation
        Returns:
            output (Tensor): current_len_output x bsz x hidden_dim
            cache (Optional[Tensor]): n_layers x current_len_output x bsz x hidden_dim
                Only returned when the module is in eval mode (no caching in training).
        """
        output = tgt

        if self.training:
            if cache is not None:
                raise ValueError("cache parameter should be None in training mode")
            for mod in self.layers:
                output = mod(
                    output,
                    memory,
                    memory_mask=memory_mask,
                    tgt_key_padding_mask=tgt_key_padding_mask,
                    memory_key_padding_mask=memory_key_padding_mask,
                    causal_mask=causal_mask,
                    only_last=False,
                )
            return output, cache

        else:
            new_token_cache = []
            for i, mod in enumerate(self.layers):
                output = mod(
                    output,
                    memory,
                    memory_mask=memory_mask,
                    tgt_key_padding_mask=tgt_key_padding_mask,
                    memory_key_padding_mask=memory_key_padding_mask,
                    causal_mask=causal_mask,
                    only_last=True if cache is not None else False,
                )
                new_token_cache.append(output)
                # reuse the pre-calculated intermediate embeddings.
                if cache is not None:
                    output = torch.cat([cache[i], output], dim=0)

            if cache is not None:
                new_cache = torch.cat(
                    [cache, torch.stack(new_token_cache, dim=0)], dim=1)
            else:
                new_cache = torch.stack(new_token_cache, dim=0)

            return output, new_cache
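
# A worked cache-shape example for the eval path above (numbers assumed:
# n_layers=2, bsz=1, hidden_dim=256):
#   step 1: tgt (1, 1, 256), cache=None           -> output (1, 1, 256), new_cache (2, 1, 1, 256)
#   step 2: tgt (2, 1, 256), cache (2, 1, 1, 256) -> each layer recomputes only
#           the newest position; output (2, 1, 256), new_cache (2, 2, 1, 256)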
class CausalTransformerDecoderLayer(nn.TransformerDecoderLayer):

    def __init__(self, *args, re_zero=True, norm_first=True, map_attn_cfg=None, **kwargs):
        '''
        Args:
            re_zero: If True, alpha scale residuals with zero init.
        '''
        super(CausalTransformerDecoderLayer, self).__init__(*args, **kwargs)
        if re_zero:
            self.res_weight1 = nn.Parameter(torch.FloatTensor([0, ]))
            self.res_weight2 = nn.Parameter(torch.FloatTensor([0, ]))
            self.res_weight3 = nn.Parameter(torch.FloatTensor([0, ]))
        else:
            self.res_weight1 = 1.
            self.res_weight2 = 1.
            self.res_weight3 = 1.
        self.norm_first = norm_first

        self.map_attn = None
        if map_attn_cfg is not None:
            self.map_attn = build_attention(map_attn_cfg)

    def forward(
        self,
        tgt: Tensor,
        memory: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        causal_mask: Optional[Tensor] = None,
        query: Optional[Tensor] = None,
        only_last=False,
    ) -> Tensor:
        """
        Args:
            see CausalTransformerDecoder.
            If query is not None, the model performs the query stream.
        Returns:
            Tensor:
                If training: embedding of the whole layer: seq_len x bsz x hidden_dim
                If eval mode: embedding of last token: 1 x bsz x hidden_dim
        """
        if not self.norm_first:
            raise ValueError("norm_first parameter should be True!")

        if self.training:
            # the official Pytorch implementation
            x = tgt
            if query is not None:
                x = query
            x = x + self.res_weight1 * \
                self._sa_block(self.norm1(x), self.norm1(tgt),
                               causal_mask, tgt_key_padding_mask)
            if memory is not None:
                x = x + self.res_weight2 * \
                    self._mha_block(self.norm2(x), memory,
                                    memory_mask, memory_key_padding_mask)
            x = x + self.res_weight3 * self._ff_block(self.norm3(x))
            return x

        # This part is adapted from the official Pytorch implementation
        # so that only the last token gets modified and returned.
        # We follow the pre-LN transformer in https://arxiv.org/pdf/2002.04745v1.pdf .
        x = tgt
        if query is not None:
            x = query
        if only_last:
            x = x[-1:]

        if causal_mask is not None:
            attn_mask = causal_mask
            if only_last:
                attn_mask = attn_mask[-1:]  # XXX
        else:
            attn_mask = None

        # efficient self attention
        x = x + self.res_weight1 * \
            self._sa_block(self.norm1(x), self.norm1(tgt),
                           attn_mask, tgt_key_padding_mask)

        # encoder-decoder attention
        if memory is not None:
            x = x + self.res_weight2 * \
                self._mha_block(self.norm2(x), memory,
                                memory_mask, memory_key_padding_mask)

        # final feed-forward network
        x = x + self.res_weight3 * self._ff_block(self.norm3(x))

        return x

    # self-attention block
    def _sa_block(self, x: Tensor, mem: Tensor,
                  attn_mask: Optional[Tensor],
                  key_padding_mask: Optional[Tensor]) -> Tensor:
        x = self.self_attn(x, mem, mem,
                           attn_mask=attn_mask,
                           key_padding_mask=key_padding_mask,
                           need_weights=False)[0]
        return self.dropout1(x)

    # multihead attention block
    def _mha_block(self, x: Tensor, mem: Tensor,
                   attn_mask: Optional[Tensor],
                   key_padding_mask: Optional[Tensor]) -> Tensor:
        x = self.multihead_attn(x, mem, mem,
                                attn_mask=attn_mask,
                                key_padding_mask=key_padding_mask,
                                need_weights=False)[0]
        return self.dropout2(x)

    # feed forward block
    def _ff_block(self, x: Tensor) -> Tensor:
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout3(x)


class PolygenTransformerEncoderLayer(nn.TransformerEncoderLayer):

    def __init__(self, *args, re_zero=True, norm_first=True, **kwargs):
        '''
        Args:
            re_zero: If True, alpha scale residuals with zero init.
        '''
        super(PolygenTransformerEncoderLayer, self).__init__(*args, **kwargs)
        if re_zero:
            self.res_weight1 = nn.Parameter(torch.FloatTensor([0, ]))
            self.res_weight2 = nn.Parameter(torch.FloatTensor([0, ]))
        else:
            self.res_weight1 = 1.
            self.res_weight2 = 1.
        self.norm_first = norm_first

    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        r"""Pass the input through the encoder layer.
        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).
        Shape:
            see the docs in Transformer class.
        """
        # see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf
        x = src
        if self.norm_first:
            x = x + self.res_weight1 * self._sa_block(
                self.norm1(x), src_mask, src_key_padding_mask)
            x = x + self.res_weight2 * self._ff_block(self.norm2(x))
        else:
            x = self.norm1(x + self.res_weight1 * self._sa_block(
                x, src_mask, src_key_padding_mask))
            x = self.norm2(x + self.res_weight2 * self._ff_block(x))
        return x

    # self-attention block
    def _sa_block(self, x: Tensor,
                  attn_mask: Optional[Tensor],
                  key_padding_mask: Optional[Tensor]) -> Tensor:
        x = self.self_attn(x, x, x,
                           attn_mask=attn_mask,
                           key_padding_mask=key_padding_mask,
                           need_weights=False)[0]
        return self.dropout1(x)

    # feed forward block
    def _ff_block(self, x: Tensor) -> Tensor:
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout2(x)


def generate_square_subsequent_mask(sz: int, device: str = "cpu") -> torch.Tensor:
    """ Generate the attention mask for causal decoding """
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = (
        mask.float()
        .masked_fill(mask == 0, float("-inf"))
        .masked_fill(mask == 1, float(0.0))
    ).to(device=device)
    return mask
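
A minimal greedy-decoding sketch of how the cached decoder above is driven (a sketch under assumed dimensions — two layers, d_model=256, nhead=8 — not taken from the repository's configs):

import torch

layer = CausalTransformerDecoderLayer(d_model=256, nhead=8)
decoder = CausalTransformerDecoder(layer, num_layers=2).eval()

memory = torch.randn(20, 1, 256)  # len_encoded_seq x bsz x hidden_dim
tgt = torch.randn(1, 1, 256)      # embedding of the first token
cache = None
with torch.no_grad():
    for _ in range(5):
        causal_mask = generate_square_subsequent_mask(tgt.size(0))
        out, cache = decoder(tgt, memory, cache=cache, causal_mask=causal_mask)
        # out[-1:] is the embedding of the newest position; a real model would
        # project it to logits, sample a token, and re-embed it here.
        tgt = torch.cat([tgt, out[-1:]], dim=0)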
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/heads/detgen_utils/utils.py → autonomous_driving/Online-HD-Map-Construction/src/models/heads/detgen_utils/utils.py
import torch
import torch.nn.functional as F
from torch import Tensor


def generate_square_subsequent_mask(sz: int,
                                    condition_len: int = 1,
                                    bool_out=False,
                                    device: str = "cpu") -> torch.Tensor:
    """ Generate the attention mask for causal decoding """
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    if condition_len > 1:
        # the first condition_len (conditioning) positions attend to each other freely
        mask[:condition_len, :condition_len] = 1
    if not bool_out:
        mask = (
            mask.float()
            .masked_fill(mask == 0, float("-inf"))
            .masked_fill(mask == 1, float(0.0)))
    return mask.to(device=device)
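
# A worked example of the float mask (numbers assumed): for sz=4,
# condition_len=2 the two conditioning positions attend to each other
# freely while the rest stays strictly causal:
#   [[0.,   0., -inf, -inf],
#    [0.,   0., -inf, -inf],
#    [0.,   0.,   0., -inf],
#    [0.,   0.,   0.,   0.]]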

def dequantize_verts(verts, canvas_size: Tensor, add_noise=False):
    """Convert quantized (integer) vertices back to continuous values in [-1, 1]."""
    min_range = -1
    max_range = 1
    range_quantize = canvas_size
    verts = verts.type(torch.float32)
    verts = verts * (max_range - min_range) / range_quantize + min_range
    if add_noise:
        verts += torch.rand_like(verts) * range_quantize
    return verts


def quantize_verts(verts, canvas_size: Tensor):
    """Convert vertices from their original range ([-1, 1]) to discrete values
    in [0, canvas_size - 1].
    Args:
        verts: seqlen, 2
    """
    min_range = -1
    max_range = 1
    range_quantize = canvas_size - 1
    verts_ratio = (verts - min_range) / (max_range - min_range)
    verts_quantize = verts_ratio * range_quantize
    return verts_quantize.type(torch.int32)
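
# A round-trip sketch (assumes a scalar canvas size of 200 in place of the
# Tensor annotation):
#   q = quantize_verts(torch.tensor([[-1.0, 1.0]]), 200)   # -> [[0, 199]]
#   v = dequantize_verts(q, 200)                           # -> ~[[-1.0, 0.99]]
# The round trip is approximate: quantize_verts scales by canvas_size - 1
# while dequantize_verts divides by canvas_size.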

def top_k_logits(logits, k):
    """Masks logits such that logits not in the top-k are very small."""
    if k == 0:
        return logits
    else:
        values, _ = torch.topk(logits, k=k)
        k_largest = torch.min(values)
        logits = torch.where(logits < k_largest,
                             torch.ones_like(logits) * -1e9, logits)
        return logits


def top_p_logits(logits, p):
    """Masks logits using nucleus (top-p) sampling."""
    if p == 1:
        return logits
    else:
        seq, dim = logits.shape[1:]
        logits = logits.view(-1, dim)
        sort_indices = torch.argsort(logits, dim=-1, descending=True)
        probs = F.softmax(logits, dim=-1).gather(-1, sort_indices)
        cumprobs = torch.cumsum(probs, dim=-1) - probs
        # The top-1 candidate is never masked, which guarantees that at least
        # one index is always selected.
        sort_mask = (cumprobs > p).type(logits.dtype)
        batch_indices = torch.repeat_interleave(  # unused in the current implementation
            torch.arange(logits.shape[0]).unsqueeze(-1), dim, dim=-1)
        top_p_mask = torch.zeros_like(logits)
        top_p_mask = top_p_mask.scatter_add(-1, sort_indices, sort_mask)
        logits -= top_p_mask * 1e9
        return logits.view(-1, seq, dim)
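
For reference, a short nucleus-sampling sketch (assumed shapes — batch 2, sequence 4, vocabulary 10 — matching the seq/dim unpacking above; torch.multinomial is one way to draw from the filtered distribution):

import torch
import torch.nn.functional as F

logits = torch.randn(2, 4, 10)           # (bsz, seq, vocab)
filtered = top_p_logits(logits, p=0.9)   # outside-nucleus entries pushed to ~ -1e9
probs = F.softmax(filtered, dim=-1)
tokens = torch.multinomial(probs.view(-1, 10), 1).view(2, 4)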
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/heads/detr_bbox.py → autonomous_driving/Online-HD-Map-Construction/src/models/heads/detr_bbox.py
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Conv2d, Linear
from mmcv.runner import force_fp32
from torch.distributions.categorical import Categorical

from mmdet.core import multi_apply, reduce_mean
from mmdet.models import HEADS

from .detr_head import DETRMapFixedNumHead


@HEADS.register_module(force=True)
class DETRBboxHead(DETRMapFixedNumHead):

    def __init__(self,
                 *args,
                 canvas_size=(400, 200),
                 discrete_output=True,
                 separate_detect=True,
                 mode='xyxy',
                 bbox_size=None,
                 coord_dim=2,
                 kp_coord_dim=2,
                 **kwargs):
        # set before super().__init__() so _init_branch() can read it (hard code)
        self.canvas_size = canvas_size
        self.separate_detect = separate_detect
        self.discrete_output = discrete_output
        self.bbox_size = 3 if mode == 'sce' else 2
        if bbox_size is not None:
            self.bbox_size = bbox_size
        self.coord_dim = coord_dim  # for xyz
        self.kp_coord_dim = kp_coord_dim
        super(DETRBboxHead, self).__init__(*args, **kwargs)

        del self.canvas_size
        self.register_buffer('canvas_size', torch.tensor(canvas_size))
        self._init_embedding()

    def _init_embedding(self):
        # for bbox parameters xstart, ystart, xend, yend
        self.bbox_embedding = nn.Embedding(4, self.embed_dims)
        self.label_embed = nn.Embedding(self.num_classes, self.embed_dims)
        self.img_coord_embed = nn.Linear(2, self.embed_dims)

    def _init_branch(self,):
        """Initialize classification branch and regression branch of head."""
        # add sigmoid or not
        if self.separate_detect:
            if self.cls_out_channels == self.num_classes + 1:
                self.cls_out_channels = 2
            else:
                self.cls_out_channels = 1
        fc_cls = Linear(self.embed_dims, self.cls_out_channels)

        reg_branch = []
        for _ in range(self.num_reg_fcs):
            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
            reg_branch.append(nn.LayerNorm(self.embed_dims))
            reg_branch.append(nn.ReLU())
        if self.discrete_output:
            reg_branch.append(
                nn.Linear(self.embed_dims, max(self.canvas_size), bias=True,))
        else:
            reg_branch.append(
                nn.Linear(self.embed_dims,
                          self.bbox_size * self.coord_dim, bias=True,))
        reg_branch = nn.Sequential(*reg_branch)

        def _get_clones(module, N):
            return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

        num_pred = self.transformer.decoder.num_layers
        if self.iterative:
            fc_cls = _get_clones(fc_cls, num_pred)
            reg_branch = _get_clones(reg_branch, num_pred)
        self.pre_branches = nn.ModuleDict([
            ('cls', fc_cls),
            ('reg', reg_branch),
        ])

    def _prepare_context(self, batch, context):
        """Prepare class label and vertex context."""
        global_context_embedding = None
        if self.separate_detect:
            global_context_embedding = self.label_embed(batch['class_label'])

        # Image context
        if self.separate_detect:
            image_embeddings = assign_bev(
                context['bev_embeddings'], batch['batch_idx'])
        else:
            image_embeddings = context['bev_embeddings']
        image_embeddings = self.input_proj(image_embeddings)  # only change feature size

        # Pass images through encoder
        device = image_embeddings.device
        # Add 2D coordinate grid embedding
        B, C, H, W = image_embeddings.shape
        Ws = torch.linspace(-1., 1., W)
        Hs = torch.linspace(-1., 1., H)
        image_coords = torch.stack(torch.meshgrid(Hs, Ws), dim=-1).to(device)
        image_coord_embeddings = self.img_coord_embed(image_coords)
        image_embeddings += image_coord_embeddings[None].permute(0, 3, 1, 2)

        # Reshape spatial grid to sequence
        sequential_context_embeddings = image_embeddings.reshape(B, C, H, W)

        return (global_context_embedding, sequential_context_embeddings)

    def forward(self, batch, context, img_metas=None):
        '''
        Args:
            bev_feature (List[Tensor]): shape [B, C, H, W]
                feature in bev view
            img_metas
        Outs:
            preds_dict (Dict):
                all_cls_scores (Tensor): Classification score of all
                    decoder layers, has shape
                    [nb_dec, bs, num_query, cls_out_channels].
                all_lines_preds (Tensor):
                    [nb_dec, bs, num_query, num_points, 2].
        '''
        (global_context_embedding, sequential_context_embeddings) = \
            self._prepare_context(batch, context)

        if self.separate_detect:
            query_embedding = self.query_embedding.weight[None] + \
                global_context_embedding[:, None]
        else:
            B = sequential_context_embeddings.shape[0]
            query_embedding = self.query_embedding.weight[None].repeat(B, 1, 1)
        x = sequential_context_embeddings

        B, C, H, W = x.shape
        masks = x.new_zeros((B, H, W))
        pos_embed = self.positional_encoding(masks)

        # outs_dec: [nb_dec, bs, num_query, embed_dim]
        outs_dec, _ = self.transformer(x, masks.type(torch.bool),
                                       query_embedding, pos_embed)

        outputs = []
        for i, query_feat in enumerate(outs_dec):
            outputs.append(self.get_prediction(query_feat))

        return outputs

    def get_prediction(self, query_feat):
        ocls = self.pre_branches['cls'](query_feat)
        if self.discrete_output:
            pos = []
            for i in range(4):
                pos_embeds = self.bbox_embedding.weight[i]
                _pos = self.pre_branches['reg'](query_feat + pos_embeds)
                pos.append(_pos)
            # # y mask
            # _vert_mask = torch.arange(logits.shape[-1], device=logits.device)
            # vertices_mask_y = (_vert_mask < self.canvas_size[1]+1)
            # logits[:,1::2] = logits[:,1::2]*vertices_mask_y - ~vertices_mask_y*1e9
            logits = torch.stack(pos, dim=-2) / 1.
            lines = Categorical(logits=logits)
        else:
            lines = self.pre_branches['reg'](query_feat).sigmoid()
            lines = lines.unflatten(
                -1, (self.bbox_size, self.coord_dim)) * self.canvas_size
            lines = lines.flatten(-2)
        return dict(
            lines=lines,  # [bs, num_query, 4, num_canvas_size]
            scores=ocls,  # [bs, num_query, num_class]
        )
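
    # A minimal decoding sketch for the discrete head (assuming `pred` is one
    # entry of the list returned by forward() with discrete_output=True):
    #   coords = pred['lines'].logits.argmax(-1)   # [bs, num_query, 4] integer coords
    #   sample = pred['lines'].sample()            # stochastic alternative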
    @force_fp32(apply_to=('score_pred', 'lines_pred', 'gt_lines'))
    def _get_target_single(self,
                           score_pred,
                           lines_pred,
                           gt_labels,
                           gt_lines,
                           gt_bboxes_ignore=None):
        """
        Compute regression and classification targets for one image.
        Outputs from a single decoder layer of a single feature level are used.
        Args:
            cls_score (Tensor): Box score logits from a single decoder layer
                for one image. Shape [num_query, cls_out_channels].
            lines_pred (Tensor):
                shape [num_query, num_points, 2].
            gt_lines (Tensor):
                shape [num_gt, num_points, 2].
            gt_labels (torch.LongTensor):
                shape [num_gt, ]
        Returns:
            tuple[Tensor]: a tuple containing the following for one image.
                - labels (LongTensor): Labels of each image.
                    shape [num_query, 1]
                - label_weights (Tensor): Label weights of each image.
                    shape [num_query, 1]
                - lines_target (Tensor): Lines targets of each image.
                    shape [num_query, num_points, 2]
                - lines_weights (Tensor): Lines weights of each image.
                    shape [num_query, num_points, 2]
                - pos_inds (Tensor): Sampled positive indices for each image.
                - neg_inds (Tensor): Sampled negative indices for each image.
        """
        num_pred_lines = len(lines_pred)
        # assigner and sampler
        assign_result = self.assigner.assign(
            preds=dict(lines=lines_pred, scores=score_pred,),
            gts=dict(lines=gt_lines, labels=gt_labels,),
            gt_bboxes_ignore=gt_bboxes_ignore)
        sampling_result = self.sampler.sample(
            assign_result, lines_pred, gt_lines)
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds
        pos_gt_inds = sampling_result.pos_assigned_gt_inds

        # label targets: 0: foreground, 1: background
        if self.separate_detect:
            labels = gt_lines.new_full((num_pred_lines,), 1, dtype=torch.long)
        else:
            labels = gt_lines.new_full(
                (num_pred_lines,), self.num_classes, dtype=torch.long)
        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
        label_weights = gt_lines.new_ones(num_pred_lines)

        # bbox targets: lines_pred's last dimension is the vocabulary,
        # and the ground truth does not have this dimension.
        if self.discrete_output:
            lines_target = torch.zeros_like(lines_pred[..., 0]).long()
            lines_weights = torch.zeros_like(lines_pred[..., 0])
        else:
            lines_target = torch.zeros_like(lines_pred)
            lines_weights = torch.zeros_like(lines_pred)
        lines_target[pos_inds] = sampling_result.pos_gt_bboxes.type(
            lines_target.dtype)
        lines_weights[pos_inds] = 1.0

        n = lines_weights.sum(-1, keepdim=True)
        lines_weights = lines_weights / n.masked_fill(n == 0, 1)

        return (labels, label_weights, lines_target, lines_weights,
                pos_inds, neg_inds, pos_gt_inds)

    # @force_fp32(apply_to=('preds', 'gts'))
    def get_targets(self, preds, gts, gt_bboxes_ignore_list=None):
        """
        Compute regression and classification targets for a batch of images.
        Outputs from a single decoder layer of a single feature level are used.
        Args:
            cls_scores_list (list[Tensor]): Box score logits from a single
                decoder layer for each image with shape [num_query,
                cls_out_channels].
            lines_preds_list (list[Tensor]): [num_query, num_points, 2].
            gt_lines_list (list[Tensor]): Ground truth lines for each image
                with shape (num_gts, num_points, 2)
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
                boxes which can be ignored for each image. Default None.
        Returns:
            tuple: a tuple containing the following targets.
                - labels_list (list[Tensor]): Labels for all images.
                - label_weights_list (list[Tensor]): Label weights for all \
                    images.
                - lines_targets_list (list[Tensor]): Lines targets for all \
                    images.
                - lines_weight_list (list[Tensor]): Lines weights for all \
                    images.
                - num_total_pos (int): Number of positive samples in all \
                    images.
                - num_total_neg (int): Number of negative samples in all \
                    images.
        """
        assert gt_bboxes_ignore_list is None, \
            'Only supports for gt_bboxes_ignore setting to None.'

        # format the inputs
        if self.separate_detect:
            bbox = [b[m] for b, m in zip(gts['bbox'], gts['bbox_mask'])]
            class_label = torch.zeros_like(gts['bbox_mask']).long()
            class_label = [b[m] for b, m in zip(class_label, gts['bbox_mask'])]
        else:
            class_label = gts['class_label']
            bbox = gts['bbox']

        if self.discrete_output:
            lines_pred = preds['lines'].logits
        else:
            lines_pred = preds['lines']
            bbox = [b.float() for b in bbox]

        (labels_list, label_weights_list,
         lines_targets_list, lines_weights_list,
         pos_inds_list, neg_inds_list, pos_gt_inds_list) = multi_apply(
            self._get_target_single, preds['scores'], lines_pred,
            class_label, bbox, gt_bboxes_ignore=gt_bboxes_ignore_list)

        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))

        new_gts = dict(
            labels=labels_list,
            label_weights=label_weights_list,
            bboxs=lines_targets_list,
            bboxs_weights=lines_weights_list,
        )

        return (new_gts, num_total_pos, num_total_neg,
                pos_inds_list, pos_gt_inds_list)

    # @force_fp32(apply_to=('preds', 'gts'))
    def loss_single(self,
                    preds: dict,
                    gts: dict,
                    gt_bboxes_ignore_list=None,
                    reduction='none'):
        """
        Loss function for outputs from a single decoder layer of a single
        feature level.
        Args:
            cls_scores (Tensor): Box score logits from a single decoder layer
                for all images. Shape [bs, num_query, cls_out_channels].
            lines_preds (Tensor):
                shape [bs, num_query, num_points, 2].
            gt_lines_list (list[Tensor]):
                with shape (num_gts, num_points, 2)
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
                boxes which can be ignored for each image. Default None.
        Returns:
            dict[str, Tensor]: A dictionary of loss components for outputs from
                a single decoder layer.
        """
        # Get targets for each sample
        new_gts, num_total_pos, num_total_neg, pos_inds_list, pos_gt_inds_list = \
            self.get_targets(preds, gts, gt_bboxes_ignore_list)

        # Batch all data
        for k, v in new_gts.items():
            new_gts[k] = torch.stack(v, dim=0)

        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                preds['scores'].new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)

        # Classification loss
        if self.separate_detect:
            loss_cls = self.bce_loss(preds['scores'], new_gts['labels'],
                                     new_gts['label_weights'], cls_avg_factor)
        else:
            # the classification loss expects shape (N, num_classes),
            # so flatten the prediction accordingly.
            cls_scores = preds['scores'].reshape(-1, self.cls_out_channels)
            cls_labels = new_gts['labels'].reshape(-1)
            cls_weights = new_gts['label_weights'].reshape(-1)
            loss_cls = self.loss_cls(
                cls_scores, cls_labels, cls_weights, avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # position NLL loss
        if self.discrete_output:
            loss_reg = -(preds['lines'].log_prob(new_gts['bboxs'])
                         * new_gts['bboxs_weights']).sum() / (num_total_pos)
        else:
            loss_reg = self.reg_loss(preds['lines'], new_gts['bboxs'],
                                     new_gts['bboxs_weights'],
                                     avg_factor=num_total_pos)

        loss_dict = dict(
            cls=loss_cls,
            reg=loss_reg,
        )

        return loss_dict, pos_inds_list, pos_gt_inds_list

    def bce_loss(self, logits, label, weights, cls_avg_factor):
        '''Binary cross-entropy: -log(p) for positives, -log(1 - p) for
        negatives, where label == 0 marks the positive (foreground) queries.
        logits: B, n, 1
        '''
        p = logits.squeeze(-1).sigmoid()
        pos_msk = label == 0
        neg_msk = ~pos_msk
        loss_cls = -(p.log() * pos_msk + (1 - p).log() * neg_msk)
        loss_cls = (loss_cls * weights).sum() / cls_avg_factor
        return loss_cls
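
    # Sanity note (a sketch, assuming unit weights and cls_avg_factor = N):
    # bce_loss then reduces to F.binary_cross_entropy(p, (label == 0).float()),
    # since -(t * log(p) + (1 - t) * log(1 - p)) with t = (label == 0) is
    # exactly the two masked terms above.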
    def post_process(self, preds_dicts: list, **kwargs):
        '''
        Args:
            preds_dicts:
                scores (Tensor): Classification score of all
                    decoder layers, has shape
                    [nb_dec, bs, num_query, cls_out_channels].
                lines (Tensor):
                    [nb_dec, bs, num_query, bbox parameters(4)].
        Outs:
            ret_list (List[Dict]) with length as bs
                list of result dicts for each sample in the batch
            XXX
        '''
        preds = preds_dicts[-1]
        batched_cls_scores = preds['scores']
        batched_lines_preds = preds['lines']

        batch_size = batched_cls_scores.size(0)
        device = batched_cls_scores.device
        result_dict = {
            'bbox': [],
            'scores': [],
            'labels': [],
            'bbox_flat': [],
            'lines_cls': [],
            'lines_bs_idx': [],
        }
        for i in range(batch_size):
            cls_scores = batched_cls_scores[i]
            det_preds = batched_lines_preds[i]
            max_num = self.max_lines

            if self.loss_cls.use_sigmoid:
                cls_scores = cls_scores.sigmoid()
                scores, valid_idx = cls_scores.view(-1).topk(max_num)
                det_labels = valid_idx % self.num_classes
                valid_idx = valid_idx // self.num_classes
                det_preds = det_preds[valid_idx]
            else:
                scores, det_labels = F.softmax(
                    cls_scores, dim=-1)[..., :-1].max(-1)
                scores, valid_idx = scores.topk(max_num)
                det_preds = det_preds[valid_idx]
                det_labels = det_labels[valid_idx]

            nline = len(valid_idx)
            result_dict['bbox'].append(det_preds)
            result_dict['scores'].append(scores)
            result_dict['labels'].append(det_labels)
            result_dict['lines_bs_idx'].extend([i] * nline)

        # for downstream polyline
        _bboxs = torch.cat(result_dict['bbox'], dim=0)
        # quantize the data
        result_dict['bbox_flat'] = torch.round(_bboxs).type(torch.int32)
        result_dict['lines_cls'] = torch.cat(
            result_dict['labels'], dim=0).long()
        result_dict['lines_bs_idx'] = torch.tensor(
            result_dict['lines_bs_idx'], device=device).long()

        return result_dict


def assign_bev(feat, idx):
    return feat[idx]
autonomous_driving/Online-HD-Map-Construction-CVPR2023/src/models/heads/detr_head.py → autonomous_driving/Online-HD-Map-Construction/src/models/heads/detr_head.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy
from mmdet.models import HEADS
from mmcv.cnn import Conv2d
from mmcv.cnn import Linear, build_activation_layer, bias_init_with_prob
from mmcv.cnn.bricks.transformer import build_positional_encoding
from mmdet.models.utils import build_transformer
from mmcv.runner import force_fp32
from mmdet.core import (multi_apply, build_assigner, build_sampler, reduce_mean)
from mmdet.models.utils.transformer import inverse_sigmoid
from mmdet.models import build_loss

from .base_map_head import BaseMapHead


@HEADS.register_module()
class DETRMapFixedNumHead(BaseMapHead):

    def __init__(self,
                 num_classes=3,
                 in_channels=128,
                 num_query=100,
                 max_lines=50,
                 score_thre=0.2,
                 num_reg_fcs=2,
                 num_points=100,
                 iterative=False,
                 patch_size=None,
                 sync_cls_avg_factor=True,
                 transformer: dict = None,
                 positional_encoding: dict = None,
                 loss_cls: dict = None,
                 loss_reg: dict = None,
                 train_cfg: dict = None,
                 init_cfg=None,
                 **kwargs):
        super().__init__()

        assigner = train_cfg['assigner']
        self.assigner = build_assigner(assigner)
        # DETR sampling=False, so use PseudoSampler
        sampler_cfg = dict(type='PseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)

        self.train_cfg = train_cfg
        self.max_lines = max_lines
        self.score_thre = score_thre

        self.num_query = num_query
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.num_points = num_points

        # branch
        # if loss_cls.use_sigmoid:
        if loss_cls['use_sigmoid']:
            self.cls_out_channels = num_classes
        else:
            self.cls_out_channels = num_classes + 1
        self.iterative = iterative
        self.num_reg_fcs = num_reg_fcs

        if patch_size is not None:
            self.register_buffer(
                'patch_size', torch.tensor((patch_size[1], patch_size[0])),)

        self._build_transformer(transformer, positional_encoding)

        # loss params
        self.loss_cls = build_loss(loss_cls)
        self.bg_cls_weight = 0.1
        if self.loss_cls.use_sigmoid:
            self.bg_cls_weight = 0.0
        self.sync_cls_avg_factor = sync_cls_avg_factor
        self.reg_loss = build_loss(loss_reg)

        # add reg, cls head for each decoder layer
        self._init_layers()
        self._init_branch()
        self.init_weights()

    def _init_layers(self):
        """Initialize some layers."""
        self.input_proj = Conv2d(
            self.in_channels, self.embed_dims, kernel_size=1)
        # query_pos_embed & query_embed
        self.query_embedding = nn.Embedding(self.num_query, self.embed_dims)

    def _build_transformer(self, transformer, positional_encoding):
        # transformer
        self.act_cfg = transformer.get('act_cfg',
                                       dict(type='ReLU', inplace=True))
        self.activate = build_activation_layer(self.act_cfg)
        self.positional_encoding = build_positional_encoding(
            positional_encoding)
        self.transformer = build_transformer(transformer)
        self.embed_dims = self.transformer.embed_dims

    def _init_branch(self,):
        """Initialize classification branch and regression branch of head."""
        fc_cls = Linear(self.embed_dims, self.cls_out_channels)

        reg_branch = []
        for _ in range(self.num_reg_fcs):
            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
            reg_branch.append(nn.LayerNorm(self.embed_dims))
            reg_branch.append(nn.ReLU())
        reg_branch.append(Linear(self.embed_dims, self.num_points * 2))
        reg_branch = nn.Sequential(*reg_branch)
        # add sigmoid or not

        def _get_clones(module, N):
            return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

        num_pred = self.transformer.decoder.num_layers
        if self.iterative:
            fc_cls = _get_clones(fc_cls, num_pred)
            reg_branch = _get_clones(reg_branch, num_pred)
        self.pre_branches = nn.ModuleDict([
            ('cls', fc_cls),
            ('reg', reg_branch),
        ])

    def init_weights(self):
        """Initialize weights of the DeformDETR head."""
        for p in self.input_proj.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        self.transformer.init_weights()
        # init prediction branch
        for k, v in self.pre_branches.items():
            for param in v.parameters():
                if param.dim() > 1:
                    nn.init.xavier_uniform_(param)
        # focal loss init
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            # for the last layer
            if isinstance(self.pre_branches['cls'], nn.ModuleList):
                for m in self.pre_branches['cls']:
                    nn.init.constant_(m.bias, bias_init)
            else:
                m = self.pre_branches['cls']
                nn.init.constant_(m.bias, bias_init)

    def forward(self, bev_feature, img_metas=None):
        '''
        Args:
            bev_feature (List[Tensor]): shape [B, C, H, W]
                feature in bev view
            img_metas
        Outs:
            preds_dict (Dict):
                all_cls_scores (Tensor): Classification score of all
                    decoder layers, has shape
                    [nb_dec, bs, num_query, cls_out_channels].
                all_lines_preds (Tensor):
                    [nb_dec, bs, num_query, num_points, 2].
        '''
        x = bev_feature[0]
        x = self.input_proj(x)  # only change feature size

        B, C, H, W = x.shape
        masks = x.new_zeros((B, H, W))
        pos_embed = self.positional_encoding(masks)

        # outs_dec: [nb_dec, bs, num_query, embed_dim]
        outs_dec, _ = self.transformer(x, masks.type(torch.bool),
                                       self.query_embedding.weight, pos_embed)

        outputs = []
        for i, query_feat in enumerate(outs_dec):
            ocls = self.pre_branches['cls'](query_feat)
            oreg = self.pre_branches['reg'](query_feat)
            oreg = oreg.unflatten(dim=2, sizes=(self.num_points, 2))
            oreg[..., 0:2] = oreg[..., 0:2].sigmoid()  # normalized xyz
            outputs.append(
                dict(
                    lines=oreg,   # [bs, num_query, num_points, 2]
                    scores=ocls,  # [bs, num_query, num_class]
                )
            )
        return outputs
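
    # A shape walk-through of forward() (numbers assumed: B=2, num_query=100,
    # embed_dims=256, num_points=100, 6 decoder layers): outs_dec is
    # [6, 2, 100, 256]; each per-layer dict then holds lines of shape
    # [2, 100, 100, 2] (sigmoid-normalized) and scores of shape
    # [2, 100, cls_out_channels].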
@
force_fp32
(
apply_to
=
(
'score_pred'
,
'lines_pred'
,
'gt_lines'
))
def
_get_target_single
(
self
,
score_pred
,
lines_pred
,
gt_lines
,
gt_labels
,
gt_bboxes_ignore
=
None
):
"""
Compute regression and classification targets for one image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_score (Tensor): Box score logits from a single decoder layer
for one image. Shape [num_query, cls_out_channels].
lines_pred (Tensor):
shape [num_query, num_points, 2].
gt_lines (Tensor):
shape [num_gt, num_points, 2].
gt_labels (torch.LongTensor)
shape [num_gt, ]
Returns:
tuple[Tensor]: a tuple containing the following for one image.
- labels (LongTensor): Labels of each image.
shape [num_query, 1]
- label_weights (Tensor]): Label weights of each image.
shape [num_query, 1]
- lines_target (Tensor): Lines targets of each image.
shape [num_query, num_points, 2]
- lines_weights (Tensor): Lines weights of each image.
shape [num_query, num_points, 2]
- pos_inds (Tensor): Sampled positive indices for each image.
- neg_inds (Tensor): Sampled negative indices for each image.
"""
num_pred_lines
=
lines_pred
.
size
(
0
)
# assigner and sampler
assign_result
=
self
.
assigner
.
assign
(
preds
=
dict
(
lines
=
lines_pred
,
scores
=
score_pred
,),
gts
=
dict
(
lines
=
gt_lines
,
labels
=
gt_labels
,
),
gt_bboxes_ignore
=
gt_bboxes_ignore
)
sampling_result
=
self
.
sampler
.
sample
(
assign_result
,
lines_pred
,
gt_lines
)
pos_inds
=
sampling_result
.
pos_inds
neg_inds
=
sampling_result
.
neg_inds
# label targets
labels
=
gt_lines
.
new_full
((
num_pred_lines
,
),
self
.
num_classes
,
dtype
=
torch
.
long
)
labels
[
pos_inds
]
=
gt_labels
[
sampling_result
.
pos_assigned_gt_inds
]
label_weights
=
gt_lines
.
new_ones
(
num_pred_lines
)
# bbox targets
lines_target
=
torch
.
zeros_like
(
lines_pred
)
lines_target
[
pos_inds
]
=
sampling_result
.
pos_gt_bboxes
lines_weights
=
torch
.
zeros_like
(
lines_pred
)
lines_weights
[
pos_inds
]
=
1.0
return
(
labels
,
label_weights
,
lines_target
,
lines_weights
,
pos_inds
,
neg_inds
)
@
force_fp32
(
apply_to
=
(
'preds'
,
'gts'
))
def
get_targets
(
self
,
preds
,
gts
,
gt_bboxes_ignore_list
=
None
):
"""
Compute regression and classification targets for a batch image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_scores_list (list[Tensor]): Box score logits from a single
decoder layer for each image with shape [num_query,
cls_out_channels].
lines_preds_list (list[Tensor]): [num_query, num_points, 2].
gt_lines_list (list[Tensor]): Ground truth lines for each image
with shape (num_gts, num_points, 2)
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
gt_bboxes_ignore_list (list[Tensor], optional): Bounding
boxes which can be ignored for each image. Default None.
Returns:
tuple: a tuple containing the following targets.
- labels_list (list[Tensor]): Labels for all images.
- label_weights_list (list[Tensor]): Label weights for all \
images.
- lines_targets_list (list[Tensor]): Lines targets for all \
images.
- lines_weight_list (list[Tensor]): Lines weights for all \
images.
- num_total_pos (int): Number of positive samples in all \
images.
- num_total_neg (int): Number of negative samples in all \
images.
"""
assert
gt_bboxes_ignore_list
is
None
,
\
'Only supports for gt_bboxes_ignore setting to None.'
(
labels_list
,
label_weights_list
,
lines_targets_list
,
lines_weights_list
,
pos_inds_list
,
neg_inds_list
)
=
multi_apply
(
self
.
_get_target_single
,
preds
[
'scores'
],
preds
[
'lines'
],
gts
[
'lines'
],
gts
[
'labels'
],
gt_bboxes_ignore
=
gt_bboxes_ignore_list
)
num_total_pos
=
sum
((
inds
.
numel
()
for
inds
in
pos_inds_list
))
num_total_neg
=
sum
((
inds
.
numel
()
for
inds
in
neg_inds_list
))
new_gts
=
dict
(
labels
=
labels_list
,
label_weights
=
label_weights_list
,
lines_targets
=
lines_targets_list
,
lines_weights
=
lines_weights_list
,
)
return
new_gts
,
num_total_pos
,
num_total_neg
,
pos_inds_list
    @force_fp32(apply_to=('preds', 'gts'))
    def loss_single(self,
                    preds: dict,
                    gts: dict,
                    gt_bboxes_ignore_list=None,
                    reduction='none'):
        """
        Loss function for outputs from a single decoder layer of a single
        feature level.
        Args:
            preds (dict):
                scores (Tensor): Box score logits from a single decoder layer
                    for all images. Shape [bs, num_query, cls_out_channels].
                lines (Tensor): shape [bs, num_query, num_points, 2].
            gts (dict):
                lines (list[Tensor]): with shape (num_gts, num_points, 2).
                labels (list[Tensor]): Ground truth class indices for each
                    image with shape (num_gts, ).
            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
                boxes which can be ignored for each image. Default None.
        Returns:
            dict[str, Tensor]: A dictionary of loss components for outputs from
                a single decoder layer.
        """
        # get targets for each sample
        new_gts, num_total_pos, num_total_neg, pos_inds_list = \
            self.get_targets(preds, gts, gt_bboxes_ignore_list)

        # batch all data by concatenating the per-sample targets
        for k, v in new_gts.items():
            new_gts[k] = torch.cat(v, 0)

        # construct weighted avg_factor to match the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                preds['scores'].new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)

        # classification loss
        cls_scores = preds['scores'].reshape(-1, self.cls_out_channels)
        loss_cls = self.loss_cls(
            cls_scores, new_gts['labels'], new_gts['label_weights'],
            avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # regression L1 loss
        lines_preds = preds['lines'].reshape(-1, self.num_points, 2)
        if reduction == 'none':
            # For performance analysis
            loss_reg = self.reg_loss(
                lines_preds, new_gts['lines_targets'],
                new_gts['lines_weights'],
                reduction_override=reduction,
                avg_factor=num_total_pos)
        else:
            loss_reg = self.reg_loss(
                lines_preds, new_gts['lines_targets'],
                new_gts['lines_weights'],
                avg_factor=num_total_pos)

        loss_dict = dict(
            cls=loss_cls,
            reg=loss_reg,
        )
        return loss_dict, pos_inds_list
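    # A quick numeric check of the avg_factor arithmetic above (a standalone
    # sketch, not part of the head): with a sigmoid head bg_cls_weight is
    # 0.0, so cls_avg_factor reduces to the positive count; with a softmax
    # head each negative contributes bg_cls_weight = 0.1:
    #
    #     num_total_pos, num_total_neg = 12, 88
    #     cls_avg_factor = 12 * 1.0 + 88 * 0.1     # = 20.8
    #     cls_avg_factor = max(cls_avg_factor, 1)  # guard: zero positives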
    @force_fp32(apply_to=('gts', 'preds_dicts'))
    def loss(self,
             gts: dict,
             preds_dicts: dict,
             gt_bboxes_ignore=None,
             reduction='mean'):
        """
        Loss function.
        Args:
            gts (dict):
                lines (list[Tensor]): Ground truth lines for each image
                    with shape (num_gts, num_points, 2).
                labels (list[Tensor]): Ground truth class indices for each
                    image with shape (num_gts, ).
            preds_dicts (list[dict]): one dict per decoder layer, each with
                scores (Tensor): Classification scores, shape
                    [bs, num_query, cls_out_channels].
                lines (Tensor): [bs, num_query, num_points, 2].
            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
                which can be ignored for each image. Default None.
        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'gt_bboxes_ignore set to None.'

        # There may be multiple decoder layers, so apply loss_single per
        # layer. loss_single returns (loss_dict, pos_inds_list), so
        # multi_apply yields exactly two lists here.
        losses, pos_inds_lists = multi_apply(
            self.loss_single,
            preds_dicts,
            gts=gts,
            gt_bboxes_ignore_list=gt_bboxes_ignore,
            reduction=reduction)

        # Format the losses
        loss_dict = dict()
        # loss from the last decoder layer
        for k, v in losses[-1].items():
            loss_dict[k] = v
        # loss from other decoder layers
        num_dec_layer = 0
        for loss in losses[:-1]:
            for k, v in loss.items():
                loss_dict[f'd{num_dec_layer}.{k}'] = v
            num_dec_layer += 1

        return loss_dict, pos_inds_lists
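    # The prefixing loop above flattens the per-layer loss dicts into one
    # dict; with a 3-layer decoder the result looks like (illustrative keys
    # only):
    #
    #     {'cls': ..., 'reg': ...,        # last decoder layer, unprefixed
    #      'd0.cls': ..., 'd0.reg': ...,  # auxiliary loss, decoder layer 0
    #      'd1.cls': ..., 'd1.reg': ...}  # auxiliary loss, decoder layer 1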
    def post_process(self, preds_dict, tokens, gts):
        '''
        Args:
            preds_dict (list[dict]): one dict per decoder layer, each with
                scores (Tensor): Classification scores, shape
                    [bs, num_query, cls_out_channels].
                lines (Tensor): [bs, num_query, num_points, 2].
        Outs:
            ret_list (List[Dict]) with length bs: one result dict per sample
                in the batch, with keys:
                'lines': numpy.array of shape [num_pred, num_points, 2]
                'scores': numpy.array of shape [num_pred, ], post-activation
                'labels': numpy.array of shape [num_pred, ], dtype=long
        '''
        # take the predictions from the last decoder layer
        preds = preds_dict[-1]
        batched_cls_scores = preds['scores']
        batched_lines_preds = preds['lines']
        batch_size = batched_cls_scores.size(0)

        ret_list = []
        for i in range(len(tokens)):
            cls_scores = batched_cls_scores[i]
            lines_preds = batched_lines_preds[i]
            max_num = self.max_lines

            if cls_scores.shape[-1] > self.num_classes:
                # softmax head: drop the background column, then take the
                # top-k scoring queries
                scores, labels = F.softmax(
                    cls_scores, dim=-1)[..., :-1].max(-1)
                final_scores, bbox_index = scores.topk(self.max_lines)
                final_lines = lines_preds[bbox_index]
                final_labels = labels[bbox_index]
            else:
                # sigmoid head: top-k over the flattened (query, class)
                # scores, then recover query and class indices
                cls_scores = cls_scores.sigmoid()
                final_scores, indexes = cls_scores.view(-1).topk(
                    self.max_lines)
                final_labels = indexes % self.num_classes
                bbox_index = indexes // self.num_classes
                final_lines = lines_preds[bbox_index]

            ret_dict_single = {
                'token': tokens[i],
                # map normalized [0, 1] coords back to [-1, 1]
                'lines': final_lines.detach().cpu().numpy() * 2 - 1,
                'scores': final_scores.detach().cpu().numpy(),
                'labels': final_labels.detach().cpu().numpy(),
                'nline': len(final_lines),
            }

            if gts is not None:
                lines_gt = gts['lines'][i].detach().cpu().numpy()
                labels_gt = gts['labels'][i].detach().cpu().numpy()
                ret_dict_single['groundTruth'] = {
                    'token': tokens[i],
                    'nline': lines_gt.shape[0],
                    'labels': labels_gt,
                    'lines': lines_gt * 2 - 1,
                }
                # if (labels_gt == 1).any():
                #     import ipdb; ipdb.set_trace()

            ret_list.append(ret_dict_single)
        return ret_list
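    # A minimal sketch of driving post_process with dummy tensors, assuming
    # `head` is a constructed instance of this head with a sigmoid classifier
    # (cls_out_channels == num_classes == 3) and max_lines <= num_query:
    #
    #     import torch
    #     bs, num_query, num_points, num_classes = 2, 100, 100, 3
    #     preds_dict = [dict(
    #         scores=torch.randn(bs, num_query, num_classes),
    #         lines=torch.rand(bs, num_query, num_points, 2),  # in [0, 1]
    #     )]
    #     results = head.post_process(preds_dict, ['tok-0', 'tok-1'], gts=None)
    #     # results[0]['lines'] lies in [-1, 1] after the `* 2 - 1` rescaling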
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F

from mmcv.cnn import (Conv2d, Linear, build_activation_layer,
                      bias_init_with_prob)
from mmcv.cnn.bricks.transformer import build_positional_encoding
from mmcv.runner import force_fp32
from mmdet.core import (multi_apply, build_assigner, build_sampler,
                        reduce_mean)
from mmdet.models import HEADS, build_loss
from mmdet.models.utils import build_transformer
from mmdet.models.utils.transformer import inverse_sigmoid

from .base_map_head import BaseMapHead
@HEADS.register_module()
class DETRMapFixedNumHead(BaseMapHead):

    def __init__(self,
                 num_classes=3,
                 in_channels=128,
                 num_query=100,
                 max_lines=50,
                 score_thre=0.2,
                 num_reg_fcs=2,
                 num_points=100,
                 iterative=False,
                 patch_size=None,
                 sync_cls_avg_factor=True,
                 transformer: dict = None,
                 positional_encoding: dict = None,
                 loss_cls: dict = None,
                 loss_reg: dict = None,
                 train_cfg: dict = None,
                 init_cfg=None,
                 **kwargs):
        super().__init__()

        assigner = train_cfg['assigner']
        self.assigner = build_assigner(assigner)
        # DETR does no sampling, so use PseudoSampler
        sampler_cfg = dict(type='PseudoSampler')
        self.sampler = build_sampler(sampler_cfg, context=self)

        self.train_cfg = train_cfg
        self.max_lines = max_lines
        self.score_thre = score_thre
        self.num_query = num_query
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.num_points = num_points

        # classification output size depends on the loss type:
        # sigmoid heads have no explicit background channel
        if loss_cls['use_sigmoid']:
            self.cls_out_channels = num_classes
        else:
            self.cls_out_channels = num_classes + 1

        self.iterative = iterative
        self.num_reg_fcs = num_reg_fcs

        if patch_size is not None:
            self.register_buffer(
                'patch_size',
                torch.tensor((patch_size[1], patch_size[0])))

        self._build_transformer(transformer, positional_encoding)

        # loss params
        self.loss_cls = build_loss(loss_cls)
        self.bg_cls_weight = 0.1
        if self.loss_cls.use_sigmoid:
            self.bg_cls_weight = 0.0
        self.sync_cls_avg_factor = sync_cls_avg_factor
        self.reg_loss = build_loss(loss_reg)

        # add reg, cls head for each decoder layer
        self._init_layers()
        self._init_branch()
        self.init_weights()
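    # A hypothetical config sketch for building this head through the mmdet
    # registry; every value below is illustrative, and the transformer and
    # assigner sub-configs are elided, repo-specific settings:
    #
    #     head_cfg = dict(
    #         type='DETRMapFixedNumHead',
    #         num_classes=3, in_channels=128, num_query=100,
    #         max_lines=50, num_points=100, sync_cls_avg_factor=True,
    #         transformer=dict(type='Transformer'),  # elided DETR cfg
    #         positional_encoding=dict(
    #             type='SinePositionalEncoding', num_feats=128,
    #             normalize=True),
    #         loss_cls=dict(type='FocalLoss', use_sigmoid=True,
    #                       loss_weight=2.0),
    #         loss_reg=dict(type='L1Loss', loss_weight=5.0),
    #         train_cfg=dict(assigner=dict()),  # repo-specific matcher
    #     )
    #     head = HEADS.build(head_cfg)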
    def _init_layers(self):
        """Initialize input projection and query embedding layers."""
        self.input_proj = Conv2d(
            self.in_channels, self.embed_dims, kernel_size=1)
        # query_pos_embed & query_embed
        self.query_embedding = nn.Embedding(self.num_query, self.embed_dims)
    def _build_transformer(self, transformer, positional_encoding):
        # transformer
        self.act_cfg = transformer.get('act_cfg',
                                       dict(type='ReLU', inplace=True))
        self.activate = build_activation_layer(self.act_cfg)
        self.positional_encoding = build_positional_encoding(
            positional_encoding)
        self.transformer = build_transformer(transformer)
        self.embed_dims = self.transformer.embed_dims
    def _init_branch(self):
        """Initialize classification branch and regression branch of head."""
        fc_cls = Linear(self.embed_dims, self.cls_out_channels)

        reg_branch = []
        for _ in range(self.num_reg_fcs):
            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
            reg_branch.append(nn.LayerNorm(self.embed_dims))
            reg_branch.append(nn.ReLU())
        reg_branch.append(Linear(self.embed_dims, self.num_points * 2))
        # add sigmoid or not
        reg_branch = nn.Sequential(*reg_branch)

        def _get_clones(module, N):
            return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

        num_pred = self.transformer.decoder.num_layers
        if self.iterative:
            fc_cls = _get_clones(fc_cls, num_pred)
            reg_branch = _get_clones(reg_branch, num_pred)
        self.pre_branches = nn.ModuleDict([
            ('cls', fc_cls),
            ('reg', reg_branch),
        ])
    def init_weights(self):
        """Initialize weights of the head."""
        for p in self.input_proj.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        self.transformer.init_weights()
        # init prediction branches
        for k, v in self.pre_branches.items():
            for param in v.parameters():
                if param.dim() > 1:
                    nn.init.xavier_uniform_(param)
        # focal loss init
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            # for the classification layer(s)
            if isinstance(self.pre_branches['cls'], nn.ModuleList):
                for m in self.pre_branches['cls']:
                    nn.init.constant_(m.bias, bias_init)
            else:
                m = self.pre_branches['cls']
                nn.init.constant_(m.bias, bias_init)
    def forward(self, bev_feature, img_metas=None):
        '''
        Args:
            bev_feature (List[Tensor]): shape [B, C, H, W],
                feature in bev view
            img_metas
        Outs:
            outputs (List[Dict]): one dict per decoder layer, each with
                scores (Tensor): Classification scores, shape
                    [bs, num_query, cls_out_channels].
                lines (Tensor): [bs, num_query, num_points, 2].
        '''
        x = bev_feature[0]
        x = self.input_proj(x)  # only changes the channel dim

        B, C, H, W = x.shape
        masks = x.new_zeros((B, H, W))
        pos_embed = self.positional_encoding(masks)

        # outs_dec: [nb_dec, bs, num_query, embed_dim]
        outs_dec, _ = self.transformer(
            x, masks.type(torch.bool), self.query_embedding.weight, pos_embed)

        outputs = []
        for i, query_feat in enumerate(outs_dec):
            ocls = self.pre_branches['cls'](query_feat)
            oreg = self.pre_branches['reg'](query_feat)
            oreg = oreg.unflatten(dim=2, sizes=(self.num_points, 2))
            oreg[..., 0:2] = oreg[..., 0:2].sigmoid()  # normalized xy
            outputs.append(
                dict(
                    lines=oreg,   # [bs, num_query, num_points, 2]
                    scores=ocls,  # [bs, num_query, num_class]
                ))
        return outputs
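    # A shape walkthrough of forward, with assumed sizes for illustration
    # only (embed_dims = 256, a 6-layer decoder, num_query = num_points = 100):
    #
    #     bev_feature[0]: [B, in_channels, H, W]
    #     input_proj:     [B, in_channels, H, W] -> [B, 256, H, W]
    #     transformer:    outs_dec is [nb_dec=6, B, 100, 256]
    #     per layer:      scores [B, 100, cls_out_channels]
    #                     reg    [B, 100, 200] -> unflatten -> [B, 100, 100, 2]
    #     len(outputs) == 6, one dict per decoder layer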
    @force_fp32(apply_to=('score_pred', 'lines_pred', 'gt_lines'))
    def _get_target_single(self,
                           score_pred,
                           lines_pred,
                           gt_lines,
                           gt_labels,
                           gt_bboxes_ignore=None):
        """
        Compute regression and classification targets for one image.
        Outputs from a single decoder layer of a single feature level are used.
        Args:
            score_pred (Tensor): Box score logits from a single decoder layer
                for one image. Shape [num_query, cls_out_channels].
            lines_pred (Tensor): shape [num_query, num_points, 2].
            gt_lines (Tensor): shape [num_gt, num_points, 2].
            gt_labels (torch.LongTensor): shape [num_gt, ].
        Returns:
            tuple[Tensor]: a tuple containing the following for one image.
                - labels (LongTensor): Labels of each image,
                    shape [num_query, ].
                - label_weights (Tensor): Label weights of each image,
                    shape [num_query, ].
                - lines_target (Tensor): Lines targets of each image,
                    shape [num_query, num_points, 2].
                - lines_weights (Tensor): Lines weights of each image,
                    shape [num_query, num_points, 2].
                - pos_inds (Tensor): Sampled positive indices for each image.
                - neg_inds (Tensor): Sampled negative indices for each image.
        """
        num_pred_lines = lines_pred.size(0)
        # assigner and sampler
        assign_result = self.assigner.assign(
            preds=dict(lines=lines_pred, scores=score_pred),
            gts=dict(lines=gt_lines, labels=gt_labels),
            gt_bboxes_ignore=gt_bboxes_ignore)
        sampling_result = self.sampler.sample(
            assign_result, lines_pred, gt_lines)
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds

        # label targets: default to background (num_classes), then fill
        # the matched queries with their assigned gt labels
        labels = gt_lines.new_full(
            (num_pred_lines, ), self.num_classes, dtype=torch.long)
        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
        label_weights = gt_lines.new_ones(num_pred_lines)

        # line targets: zero everywhere except the matched queries
        lines_target = torch.zeros_like(lines_pred)
        lines_target[pos_inds] = sampling_result.pos_gt_bboxes
        lines_weights = torch.zeros_like(lines_pred)
        lines_weights[pos_inds] = 1.0

        return (labels, label_weights, lines_target, lines_weights,
                pos_inds, neg_inds)
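    # To make the target layout above concrete, a standalone toy sketch with
    # made-up indices (num_query = 5, num_classes = 3, two matched queries):
    #
    #     import torch
    #     pos_inds = torch.tensor([1, 4])             # matched query slots
    #     pos_assigned_gt_inds = torch.tensor([0, 1])
    #     gt_labels = torch.tensor([2, 0])
    #     labels = torch.full((5,), 3, dtype=torch.long)  # 3 == background
    #     labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
    #     # labels -> tensor([3, 2, 3, 3, 0])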
    @force_fp32(apply_to=('preds', 'gts'))
    def get_targets(self, preds, gts, gt_bboxes_ignore_list=None):
        """
        Compute regression and classification targets for a batch of images.
        Outputs from a single decoder layer of a single feature level are used.
        Args:
            preds (dict):
                scores (list[Tensor]): Box score logits from a single decoder
                    layer for each image, shape [num_query, cls_out_channels].
                lines (list[Tensor]): [num_query, num_points, 2].
            gts (dict):
                lines (list[Tensor]): Ground truth lines for each image
                    with shape (num_gts, num_points, 2).
                labels (list[Tensor]): Ground truth class indices for each
                    image with shape (num_gts, ).
            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
                boxes which can be ignored for each image. Default None.
        Returns:
            tuple: a tuple containing the following targets.
                - labels_list (list[Tensor]): Labels for all images.
                - label_weights_list (list[Tensor]): Label weights for all
                    images.
                - lines_targets_list (list[Tensor]): Lines targets for all
                    images.
                - lines_weights_list (list[Tensor]): Lines weights for all
                    images.
                - num_total_pos (int): Number of positive samples in all
                    images.
                - num_total_neg (int): Number of negative samples in all
                    images.
        """
        assert gt_bboxes_ignore_list is None, \
            'Only supports gt_bboxes_ignore set to None.'

        # run per-image target assignment, then transpose into lists
        (labels_list, label_weights_list, lines_targets_list,
         lines_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
            self._get_target_single,
            preds['scores'], preds['lines'],
            gts['lines'], gts['labels'],
            gt_bboxes_ignore=gt_bboxes_ignore_list)

        num_total_pos = sum(inds.numel() for inds in pos_inds_list)
        num_total_neg = sum(inds.numel() for inds in neg_inds_list)

        new_gts = dict(
            labels=labels_list,
            label_weights=label_weights_list,
            lines_targets=lines_targets_list,
            lines_weights=lines_weights_list,
        )
        return new_gts, num_total_pos, num_total_neg, pos_inds_list
    @force_fp32(apply_to=('preds', 'gts'))
    def loss_single(self,
                    preds: dict,
                    gts: dict,
                    gt_bboxes_ignore_list=None,
                    reduction='none'):
        """
        Loss function for outputs from a single decoder layer of a single
        feature level.
        Args:
            preds (dict):
                scores (Tensor): Box score logits from a single decoder layer
                    for all images. Shape [bs, num_query, cls_out_channels].
                lines (Tensor): shape [bs, num_query, num_points, 2].
            gts (dict):
                lines (list[Tensor]): with shape (num_gts, num_points, 2).
                labels (list[Tensor]): Ground truth class indices for each
                    image with shape (num_gts, ).
            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
                boxes which can be ignored for each image. Default None.
        Returns:
            dict[str, Tensor]: A dictionary of loss components for outputs from
                a single decoder layer.
        """
        # get targets for each sample
        new_gts, num_total_pos, num_total_neg, pos_inds_list = \
            self.get_targets(preds, gts, gt_bboxes_ignore_list)

        # batch all data by concatenating the per-sample targets
        for k, v in new_gts.items():
            new_gts[k] = torch.cat(v, 0)

        # construct weighted avg_factor to match the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                preds['scores'].new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)

        # classification loss
        cls_scores = preds['scores'].reshape(-1, self.cls_out_channels)
        loss_cls = self.loss_cls(
            cls_scores, new_gts['labels'], new_gts['label_weights'],
            avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # regression L1 loss
        lines_preds = preds['lines'].reshape(-1, self.num_points, 2)
        if reduction == 'none':
            # For performance analysis
            loss_reg = self.reg_loss(
                lines_preds, new_gts['lines_targets'],
                new_gts['lines_weights'],
                reduction_override=reduction,
                avg_factor=num_total_pos)
        else:
            loss_reg = self.reg_loss(
                lines_preds, new_gts['lines_targets'],
                new_gts['lines_weights'],
                avg_factor=num_total_pos)

        loss_dict = dict(
            cls=loss_cls,
            reg=loss_reg,
        )
        return loss_dict, pos_inds_list
    @force_fp32(apply_to=('gts', 'preds_dicts'))
    def loss(self,
             gts: dict,
             preds_dicts: dict,
             gt_bboxes_ignore=None,
             reduction='mean'):
        """
        Loss function.
        Args:
            gts (dict):
                lines (list[Tensor]): Ground truth lines for each image
                    with shape (num_gts, num_points, 2).
                labels (list[Tensor]): Ground truth class indices for each
                    image with shape (num_gts, ).
            preds_dicts (list[dict]): one dict per decoder layer, each with
                scores (Tensor): Classification scores, shape
                    [bs, num_query, cls_out_channels].
                lines (Tensor): [bs, num_query, num_points, 2].
            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
                which can be ignored for each image. Default None.
        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'gt_bboxes_ignore set to None.'

        # There may be multiple decoder layers, so apply loss_single per
        # layer. loss_single returns (loss_dict, pos_inds_list), so
        # multi_apply yields exactly two lists here.
        losses, pos_inds_lists = multi_apply(
            self.loss_single,
            preds_dicts,
            gts=gts,
            gt_bboxes_ignore_list=gt_bboxes_ignore,
            reduction=reduction)

        # Format the losses
        loss_dict = dict()
        # loss from the last decoder layer
        for k, v in losses[-1].items():
            loss_dict[k] = v
        # loss from other decoder layers
        num_dec_layer = 0
        for loss in losses[:-1]:
            for k, v in loss.items():
                loss_dict[f'd{num_dec_layer}.{k}'] = v
            num_dec_layer += 1

        return loss_dict, pos_inds_lists
    def post_process(self, preds_dict, tokens, gts):
        '''
        Args:
            preds_dict (list[dict]): one dict per decoder layer, each with
                scores (Tensor): Classification scores, shape
                    [bs, num_query, cls_out_channels].
                lines (Tensor): [bs, num_query, num_points, 2].
        Outs:
            ret_list (List[Dict]) with length bs: one result dict per sample
                in the batch, with keys:
                'lines': numpy.array of shape [num_pred, num_points, 2]
                'scores': numpy.array of shape [num_pred, ], post-activation
                'labels': numpy.array of shape [num_pred, ], dtype=long
        '''
        # take the predictions from the last decoder layer
        preds = preds_dict[-1]
        batched_cls_scores = preds['scores']
        batched_lines_preds = preds['lines']
        batch_size = batched_cls_scores.size(0)

        ret_list = []
        for i in range(len(tokens)):
            cls_scores = batched_cls_scores[i]
            lines_preds = batched_lines_preds[i]
            max_num = self.max_lines

            if cls_scores.shape[-1] > self.num_classes:
                # softmax head: drop the background column, then take the
                # top-k scoring queries
                scores, labels = F.softmax(
                    cls_scores, dim=-1)[..., :-1].max(-1)
                final_scores, bbox_index = scores.topk(self.max_lines)
                final_lines = lines_preds[bbox_index]
                final_labels = labels[bbox_index]
            else:
                # sigmoid head: top-k over the flattened (query, class)
                # scores, then recover query and class indices
                cls_scores = cls_scores.sigmoid()
                final_scores, indexes = cls_scores.view(-1).topk(
                    self.max_lines)
                final_labels = indexes % self.num_classes
                bbox_index = indexes // self.num_classes
                final_lines = lines_preds[bbox_index]

            ret_dict_single = {
                'token': tokens[i],
                # map normalized [0, 1] coords back to [-1, 1]
                'lines': final_lines.detach().cpu().numpy() * 2 - 1,
                'scores': final_scores.detach().cpu().numpy(),
                'labels': final_labels.detach().cpu().numpy(),
                'nline': len(final_lines),
            }

            if gts is not None:
                lines_gt = gts['lines'][i].detach().cpu().numpy()
                labels_gt = gts['labels'][i].detach().cpu().numpy()
                ret_dict_single['groundTruth'] = {
                    'token': tokens[i],
                    'nline': lines_gt.shape[0],
                    'labels': labels_gt,
                    'lines': lines_gt * 2 - 1,
                }
                # if (labels_gt == 1).any():
                #     import ipdb; ipdb.set_trace()

            ret_list.append(ret_dict_single)
        return ret_list
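    # Putting the pieces together: a hedged sketch of one training step with
    # this head, assuming `head`, `bev_feature`, and batched `gts` have been
    # built elsewhere:
    #
    #     preds_dicts = head(bev_feature)          # one dict per decoder layer
    #     loss_dict, pos_inds_lists = head.loss(gts, preds_dicts)
    #     total_loss = sum(loss_dict.values())     # cls/reg plus d{i}.* terms
    #     total_loss.backward()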