Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dcnv3
Commits
cce49ba9
Unverified
Commit
cce49ba9
authored
Apr 21, 2023
by
Chengyu Wang
Committed by
GitHub
Apr 21, 2023
Browse files
Add openlane v2 (#121)
parent
dbf29e61
Changes
93
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
2936 additions
and
0 deletions
+2936
-0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/multi_scale_deformable_attn_function.py
...ne/models/modules/multi_scale_deformable_attn_function.py
+163
-0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/spatial_cross_attention.py
...mdet3d/baseline/models/modules/spatial_cross_attention.py
+398
-0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/temporal_self_attention.py
...mdet3d/baseline/models/modules/temporal_self_attention.py
+275
-0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/transformer.py
...-v2/plugin/mmdet3d/baseline/models/modules/transformer.py
+139
-0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/necks/__init__.py
...nlane-v2/plugin/mmdet3d/baseline/models/necks/__init__.py
+2
-0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/necks/custom_fpn.py
...ane-v2/plugin/mmdet3d/baseline/models/necks/custom_fpn.py
+224
-0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/necks/custom_ipm_view_transformer.py
...et3d/baseline/models/necks/custom_ipm_view_transformer.py
+195
-0
autonomous_driving/openlane-v2/plugin/mmdet3d/configs/baseline.py
...us_driving/openlane-v2/plugin/mmdet3d/configs/baseline.py
+210
-0
autonomous_driving/openlane-v2/plugin/mmdet3d/configs/baseline_large.py
...ving/openlane-v2/plugin/mmdet3d/configs/baseline_large.py
+353
-0
autonomous_driving/openlane-v2/plugin/mmdet3d/configs/internimage-s.py
...iving/openlane-v2/plugin/mmdet3d/configs/internimage-s.py
+360
-0
autonomous_driving/openlane-v2/requirements.txt
autonomous_driving/openlane-v2/requirements.txt
+13
-0
autonomous_driving/openlane-v2/setup.py
autonomous_driving/openlane-v2/setup.py
+35
-0
autonomous_driving/openlane-v2/tutorial.ipynb
autonomous_driving/openlane-v2/tutorial.ipynb
+569
-0
No files found.
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/multi_scale_deformable_attn_function.py
0 → 100644
View file @
cce49ba9
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import
torch
from
torch.cuda.amp
import
custom_bwd
,
custom_fwd
from
torch.autograd.function
import
Function
,
once_differentiable
from
mmcv.utils
import
ext_loader
# Load the compiled mmcv C++/CUDA extension that provides the multi-scale
# deformable attention kernels used by the autograd Functions below.
ext_module = ext_loader.load_ext(
    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
class MultiScaleDeformableAttnFunction_fp16(Function):
    """Autograd Function wrapping the CUDA multi-scale deformable attention
    kernel with inputs cast to fp16 (via ``custom_fwd``).

    NOTE(review): apart from the ``cast_inputs`` dtype this class is
    identical to ``MultiScaleDeformableAttnFunction_fp32`` below.
    """

    @staticmethod
    @custom_fwd(cast_inputs=torch.float16)
    def forward(ctx, value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights, im2col_step):
        """GPU version of multi-scale deformable attention.

        Args:
            value (Tensor): The value has shape
                (bs, num_keys, mum_heads, embed_dims//num_heads)
            value_spatial_shapes (Tensor): Spatial shape of
                each feature map, has shape (num_levels, 2),
                last dimension 2 represent (h, w)
            sampling_locations (Tensor): The location of sampling points,
                has shape
                (bs ,num_queries, num_heads, num_levels, num_points, 2),
                the last dimension 2 represent (x, y).
            attention_weights (Tensor): The weight of sampling points used
                when calculate the attention, has shape
                (bs ,num_queries, num_heads, num_levels, num_points),
            im2col_step (Tensor): The step used in image to column.

        Returns:
            Tensor: has shape (bs, num_queries, embed_dims)
        """
        # Stash the step on ctx so backward can pass the same value to the
        # backward kernel.
        ctx.im2col_step = im2col_step
        output = ext_module.ms_deform_attn_forward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            im2col_step=ctx.im2col_step)
        # Saved in the exact order backward unpacks them.
        ctx.save_for_backward(value, value_spatial_shapes,
                              value_level_start_index, sampling_locations,
                              attention_weights)
        return output

    @staticmethod
    @once_differentiable
    @custom_bwd
    def backward(ctx, grad_output):
        """GPU version of backward function.

        Args:
            grad_output (Tensor): Gradient of output tensor of forward.

        Returns:
            Tuple[Tensor]: Gradient of input tensors in forward.
        """
        value, value_spatial_shapes, value_level_start_index, \
            sampling_locations, attention_weights = ctx.saved_tensors
        # Output gradient buffers filled in-place by the CUDA kernel.
        grad_value = torch.zeros_like(value)
        grad_sampling_loc = torch.zeros_like(sampling_locations)
        grad_attn_weight = torch.zeros_like(attention_weights)

        ext_module.ms_deform_attn_backward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            grad_output.contiguous(),
            grad_value,
            grad_sampling_loc,
            grad_attn_weight,
            im2col_step=ctx.im2col_step)

        # One gradient per forward input; the shape tensors, level index and
        # im2col_step are non-differentiable, hence None.
        return grad_value, None, None, \
            grad_sampling_loc, grad_attn_weight, None
class MultiScaleDeformableAttnFunction_fp32(Function):
    """Autograd Function wrapping the CUDA multi-scale deformable attention
    kernel with inputs cast to fp32 (via ``custom_fwd``).
    """

    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights, im2col_step):
        """GPU version of multi-scale deformable attention.

        Args:
            value (Tensor): The value has shape
                (bs, num_keys, mum_heads, embed_dims//num_heads)
            value_spatial_shapes (Tensor): Spatial shape of
                each feature map, has shape (num_levels, 2),
                last dimension 2 represent (h, w)
            sampling_locations (Tensor): The location of sampling points,
                has shape
                (bs ,num_queries, num_heads, num_levels, num_points, 2),
                the last dimension 2 represent (x, y).
            attention_weights (Tensor): The weight of sampling points used
                when calculate the attention, has shape
                (bs ,num_queries, num_heads, num_levels, num_points),
            im2col_step (Tensor): The step used in image to column.

        Returns:
            Tensor: has shape (bs, num_queries, embed_dims)
        """
        # Stash the step on ctx so backward can pass the same value to the
        # backward kernel.
        ctx.im2col_step = im2col_step
        output = ext_module.ms_deform_attn_forward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            im2col_step=ctx.im2col_step)
        # Saved in the exact order backward unpacks them.
        ctx.save_for_backward(value, value_spatial_shapes,
                              value_level_start_index, sampling_locations,
                              attention_weights)
        return output

    @staticmethod
    @once_differentiable
    @custom_bwd
    def backward(ctx, grad_output):
        """GPU version of backward function.

        Args:
            grad_output (Tensor): Gradient of output tensor of forward.

        Returns:
            Tuple[Tensor]: Gradient of input tensors in forward.
        """
        value, value_spatial_shapes, value_level_start_index, \
            sampling_locations, attention_weights = ctx.saved_tensors
        # Output gradient buffers filled in-place by the CUDA kernel.
        grad_value = torch.zeros_like(value)
        grad_sampling_loc = torch.zeros_like(sampling_locations)
        grad_attn_weight = torch.zeros_like(attention_weights)

        ext_module.ms_deform_attn_backward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            grad_output.contiguous(),
            grad_value,
            grad_sampling_loc,
            grad_attn_weight,
            im2col_step=ctx.im2col_step)

        # One gradient per forward input; the shape tensors, level index and
        # im2col_step are non-differentiable, hence None.
        return grad_value, None, None, \
            grad_sampling_loc, grad_attn_weight, None
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/spatial_cross_attention.py
0 → 100644
View file @
cce49ba9
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from
mmcv.ops.multi_scale_deform_attn
import
multi_scale_deformable_attn_pytorch
import
warnings
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmcv.cnn
import
xavier_init
,
constant_init
from
mmcv.cnn.bricks.registry
import
(
ATTENTION
,
TRANSFORMER_LAYER
,
TRANSFORMER_LAYER_SEQUENCE
)
from
mmcv.cnn.bricks.transformer
import
build_attention
import
math
from
mmcv.runner
import
force_fp32
,
auto_fp16
from
mmcv.runner.base_module
import
BaseModule
,
ModuleList
,
Sequential
from
mmcv.utils
import
ext_loader
from
.multi_scale_deformable_attn_function
import
MultiScaleDeformableAttnFunction_fp32
,
\
MultiScaleDeformableAttnFunction_fp16
# Load the compiled mmcv C++/CUDA extension providing the deformable
# attention kernels (same extension as in
# multi_scale_deformable_attn_function.py).
ext_module = ext_loader.load_ext(
    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
@ATTENTION.register_module()
class SpatialCrossAttention(BaseModule):
    """An attention module used in BEVFormer.

    Each camera only attends to the BEV queries whose reference points
    project into that camera's image, which greatly reduces memory use.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_cams (int): The number of cameras
        dropout (float): A Dropout layer on `inp_residual`.
            Default: 0..
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        deformable_attention: (dict): The config for the deformable attention used in SCA.
    """

    def __init__(self,
                 embed_dims=256,
                 num_cams=6,
                 pc_range=None,
                 dropout=0.1,
                 init_cfg=None,
                 batch_first=False,
                 deformable_attention=dict(
                     type='MSDeformableAttention3D',
                     embed_dims=256,
                     num_levels=4),
                 **kwargs):
        super(SpatialCrossAttention, self).__init__(init_cfg)

        self.init_cfg = init_cfg
        self.dropout = nn.Dropout(dropout)
        self.pc_range = pc_range
        self.fp16_enabled = False
        # Per-camera deformable attention built from config.
        self.deformable_attention = build_attention(deformable_attention)
        self.embed_dims = embed_dims
        self.num_cams = num_cams
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.batch_first = batch_first
        self.init_weight()

    def init_weight(self):
        """Default initialization for Parameters of Module."""
        xavier_init(self.output_proj, distribution='uniform', bias=0.)

    @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam'))
    def forward(self,
                query,
                key,
                value,
                residual=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                reference_points_cam=None,
                bev_mask=None,
                level_start_index=None,
                flag='encoder',
                **kwargs):
        """Forward Function of Detr3DCrossAtten.

        Args:
            query (Tensor): Query of Transformer with shape
                (num_query, bs, embed_dims).
            key (Tensor): The key tensor with shape
                `(num_key, bs, embed_dims)`.
            value (Tensor): The value tensor with shape
                `(num_key, bs, embed_dims)`. (B, N, C, H, W)
            residual (Tensor): The tensor used for addition, with the
                same shape as `x`. Default None. If None, `x` will be used.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`. Default
                None.
            reference_points (Tensor): The normalized reference
                points with shape (bs, num_query, 4),
                all elements is range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area.
                or (N, Length_{query}, num_levels, 4), add
                additional two dimensions is (w, h) to
                form reference boxes.
            key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_key].
            spatial_shapes (Tensor): Spatial shape of features in
                different level. With shape (num_levels, 2),
                last dimension represent (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape (num_levels) and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
            Tensor: forwarded results with shape [num_query, bs, embed_dims].
        """
        if key is None:
            key = query
        if value is None:
            value = key

        # BUGFIX: previously `inp_residual` and `slots` were assigned only
        # inside `if residual is None:`, so passing a non-None residual
        # raised UnboundLocalError and the argument was unusable. Honor the
        # residual when given; default to `query` otherwise (unchanged
        # behavior for the default call path).
        if residual is None:
            residual = query
        inp_residual = residual
        slots = torch.zeros_like(query)

        if query_pos is not None:
            query = query + query_pos

        bs, num_query, _ = query.size()
        # D: number of reference points (height anchors) per BEV query.
        D = reference_points_cam.size(3)
        indexes = []
        for i, mask_per_img in enumerate(bev_mask):
            # NOTE(review): only batch element 0 of the mask is used here —
            # presumably the hit-mask is shared across the batch; confirm
            # against the caller.
            index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1)
            indexes.append(index_query_per_img)
        max_len = max([len(each) for each in indexes])

        # each camera only interacts with its corresponding BEV queries.
        # This step can greatly save GPU memory.
        queries_rebatch = query.new_zeros(
            [bs, self.num_cams, max_len, self.embed_dims])
        reference_points_rebatch = reference_points_cam.new_zeros(
            [bs, self.num_cams, max_len, D, 2])

        for j in range(bs):
            for i, reference_points_per_img in enumerate(reference_points_cam):
                index_query_per_img = indexes[i]
                queries_rebatch[j, i, :len(index_query_per_img)] = \
                    query[j, index_query_per_img]
                reference_points_rebatch[j, i, :len(index_query_per_img)] = \
                    reference_points_per_img[j, index_query_per_img]

        num_cams, l, bs, embed_dims = key.shape

        # Fold the camera axis into the batch axis for the deformable
        # attention call: (num_cams, l, bs, C) -> (bs*num_cams, l, C).
        key = key.permute(2, 0, 1, 3).reshape(
            bs * self.num_cams, l, self.embed_dims)
        value = value.permute(2, 0, 1, 3).reshape(
            bs * self.num_cams, l, self.embed_dims)

        queries = self.deformable_attention(
            query=queries_rebatch.view(
                bs * self.num_cams, max_len, self.embed_dims),
            key=key,
            value=value,
            reference_points=reference_points_rebatch.view(
                bs * self.num_cams, max_len, D, 2),
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index).view(
                bs, self.num_cams, max_len, self.embed_dims)

        # Scatter per-camera results back to the full query set; queries hit
        # by several cameras accumulate and are averaged by `count` below.
        for j in range(bs):
            for i, index_query_per_img in enumerate(indexes):
                slots[j, index_query_per_img] += \
                    queries[j, i, :len(index_query_per_img)]

        count = bev_mask.sum(-1) > 0
        count = count.permute(1, 2, 0).sum(-1)
        count = torch.clamp(count, min=1.0)
        slots = slots / count[..., None]
        slots = self.output_proj(slots)

        return self.dropout(slots) + inp_residual
@ATTENTION.register_module()
class MSDeformableAttention3D(BaseModule):
    """An attention module used in BEVFormer based on Deformable-Detr.

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
    <https://arxiv.org/pdf/2010.04159.pdf>`_.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 64.
        num_levels (int): The number of feature map used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 4.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): A Dropout layer on `inp_identity`.
            Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to False.
        norm_cfg (dict): Config dict for normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=8,
                 im2col_step=64,
                 dropout=0.1,
                 batch_first=True,
                 norm_cfg=None,
                 init_cfg=None):
        super().__init__(init_cfg)
        if embed_dims % num_heads != 0:
            raise ValueError(f'embed_dims must be divisible by num_heads, '
                             f'but got {embed_dims} and {num_heads}')
        dim_per_head = embed_dims // num_heads
        self.norm_cfg = norm_cfg
        self.batch_first = batch_first
        # NOTE(review): this variant has no output projection (the caller,
        # SpatialCrossAttention, applies its own); `dropout` is accepted but
        # never used in this class.
        self.output_proj = None
        self.fp16_enabled = False

        # you'd better set dim_per_head to a power of 2
        # which is more efficient in the CUDA implementation
        def _is_power_of_2(n):
            if (not isinstance(n, int)) or (n < 0):
                raise ValueError(
                    'invalid input for _is_power_of_2: {} (type: {})'.format(
                        n, type(n)))
            return (n & (n - 1) == 0) and n != 0

        if not _is_power_of_2(dim_per_head):
            warnings.warn(
                "You'd better set embed_dims in "
                'MultiScaleDeformAttention to make '
                'the dimension of each attention head a power of 2 '
                'which is more efficient in our CUDA implementation.')

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points
        # Predicts (x, y) offsets for every head/level/point from the query.
        self.sampling_offsets = nn.Linear(
            embed_dims, num_heads * num_levels * num_points * 2)
        # Predicts one attention weight per head/level/point.
        self.attention_weights = nn.Linear(
            embed_dims, num_heads * num_levels * num_points)
        self.value_proj = nn.Linear(embed_dims, embed_dims)
        self.init_weights()

    def init_weights(self):
        """Default initialization for Parameters of Module."""
        constant_init(self.sampling_offsets, 0.)
        # Bias-initialize the offsets so the initial sampling points form a
        # ring of num_heads directions, scaled by point index.
        thetas = torch.arange(
            self.num_heads,
            dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init /
                     grid_init.abs().max(-1, keepdim=True)[0]).view(
                         self.num_heads, 1, 1,
                         2).repeat(1, self.num_levels, self.num_points, 1)
        for i in range(self.num_points):
            grid_init[:, :, i, :] *= i + 1

        self.sampling_offsets.bias.data = grid_init.view(-1)
        constant_init(self.attention_weights, val=0., bias=0.)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        # NOTE(review): self.output_proj is None here, so this call is
        # presumably a silent no-op in mmcv's xavier_init — confirm; dead
        # code candidate.
        xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True

    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                level_start_index=None,
                **kwargs):
        """Forward Function of MultiScaleDeformAttention.

        Args:
            query (Tensor): Query of Transformer with shape
                ( bs, num_query, embed_dims).
            key (Tensor): The key tensor with shape
                `(bs, num_key, embed_dims)`.
            value (Tensor): The value tensor with shape
                `(bs, num_key, embed_dims)`.
            identity (Tensor): The tensor used for addition, with the
                same shape as `query`. Default None. If None,
                `query` will be used.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`. Default
                None.
            reference_points (Tensor): The normalized reference
                points with shape (bs, num_query, num_levels, 2),
                all elements is range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area.
                or (N, Length_{query}, num_levels, 4), add
                additional two dimensions is (w, h) to
                form reference boxes.
            key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_key].
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2),
                last dimension represents (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape ``(num_levels, )`` and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
            Tensor: forwarded results with shape [num_query, bs, embed_dims].
        """
        if value is None:
            value = query
        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos

        if not self.batch_first:
            # change to (bs, num_query ,embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        value = self.value_proj(value)
        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)
        value = value.view(bs, num_value, self.num_heads, -1)
        sampling_offsets = self.sampling_offsets(query).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_levels * self.num_points)

        # Softmax over all (level, point) slots jointly, then restore shape.
        attention_weights = attention_weights.softmax(-1)

        attention_weights = attention_weights.view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points)

        if reference_points.shape[-1] == 2:
            """
            For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights.
            After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image.
            For each referent point, we sample `num_points` sampling points.
            For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points.
            """
            # Normalizer (w, h) per level converts pixel offsets to the
            # [0, 1] coordinate space of sampling locations.
            offset_normalizer = torch.stack(
                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)

            bs, num_query, num_Z_anchors, xy = reference_points.shape
            reference_points = reference_points[:, :, None, None, None, :, :]
            sampling_offsets = sampling_offsets / \
                offset_normalizer[None, None, None, :, None, :]
            bs, num_query, num_heads, num_levels, num_all_points, xy = \
                sampling_offsets.shape
            # Split the point axis into (points per anchor, Z anchors) so the
            # offsets broadcast against the per-anchor reference points.
            sampling_offsets = sampling_offsets.view(
                bs, num_query, num_heads, num_levels,
                num_all_points // num_Z_anchors, num_Z_anchors, xy)
            sampling_locations = reference_points + sampling_offsets
            bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, \
                xy = sampling_locations.shape
            assert num_all_points == num_points * num_Z_anchors

            # Flatten back to the layout the CUDA kernel expects.
            sampling_locations = sampling_locations.view(
                bs, num_query, num_heads, num_levels, num_all_points, xy)

        elif reference_points.shape[-1] == 4:
            assert False
        else:
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')

        #  sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2
        #  attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points
        #
        if torch.cuda.is_available() and value.is_cuda:
            # NOTE(review): both branches select the fp32 Function —
            # presumably deliberate (fp16 deformable attention is unstable);
            # the fp16 class is imported but unused here.
            if value.dtype == torch.float16:
                MultiScaleDeformableAttnFunction = \
                    MultiScaleDeformableAttnFunction_fp32
            else:
                MultiScaleDeformableAttnFunction = \
                    MultiScaleDeformableAttnFunction_fp32
            output = MultiScaleDeformableAttnFunction.apply(
                value, spatial_shapes, level_start_index, sampling_locations,
                attention_weights, self.im2col_step)
        else:
            # Pure-PyTorch fallback for CPU tensors.
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights)
        if not self.batch_first:
            output = output.permute(1, 0, 2)

        return output
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/temporal_self_attention.py
0 → 100644
View file @
cce49ba9
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from
.multi_scale_deformable_attn_function
import
MultiScaleDeformableAttnFunction_fp32
from
mmcv.ops.multi_scale_deform_attn
import
multi_scale_deformable_attn_pytorch
import
warnings
import
torch
import
torch.nn
as
nn
from
mmcv.cnn
import
xavier_init
,
constant_init
from
mmcv.cnn.bricks.registry
import
ATTENTION
import
math
from
mmcv.runner.base_module
import
BaseModule
,
ModuleList
,
Sequential
from
mmcv.utils
import
(
ConfigDict
,
build_from_cfg
,
deprecated_api_warning
,
to_2tuple
)
from
mmcv.utils
import
ext_loader
# Load the compiled mmcv C++/CUDA extension providing the deformable
# attention kernels.
ext_module = ext_loader.load_ext(
    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
@ATTENTION.register_module()
class TemporalSelfAttention(BaseModule):
    """An attention module used in BEVFormer based on Deformable-Detr.

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
    <https://arxiv.org/pdf/2010.04159.pdf>`_.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 64.
        num_levels (int): The number of feature map used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 4.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): A Dropout layer on `inp_identity`.
            Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to True.
        norm_cfg (dict): Config dict for normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        num_bev_queue (int): In this version, we only use one history BEV and one currenct BEV.
            the length of BEV queue is 2.
    """

    def __init__(self,
                 embed_dims=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=4,
                 num_bev_queue=2,
                 im2col_step=64,
                 dropout=0.1,
                 batch_first=True,
                 norm_cfg=None,
                 init_cfg=None):
        super().__init__(init_cfg)
        if embed_dims % num_heads != 0:
            raise ValueError(f'embed_dims must be divisible by num_heads, '
                             f'but got {embed_dims} and {num_heads}')
        dim_per_head = embed_dims // num_heads
        self.norm_cfg = norm_cfg
        self.dropout = nn.Dropout(dropout)
        self.batch_first = batch_first
        self.fp16_enabled = False

        # you'd better set dim_per_head to a power of 2
        # which is more efficient in the CUDA implementation
        def _is_power_of_2(n):
            if (not isinstance(n, int)) or (n < 0):
                raise ValueError(
                    'invalid input for _is_power_of_2: {} (type: {})'.format(
                        n, type(n)))
            return (n & (n - 1) == 0) and n != 0

        if not _is_power_of_2(dim_per_head):
            warnings.warn(
                "You'd better set embed_dims in "
                'MultiScaleDeformAttention to make '
                'the dimension of each attention head a power of 2 '
                'which is more efficient in our CUDA implementation.')

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points
        self.num_bev_queue = num_bev_queue
        # Offsets/weights are predicted from [prev_bev, cur_query] concat,
        # hence the input dim embed_dims * num_bev_queue.
        self.sampling_offsets = nn.Linear(
            embed_dims * self.num_bev_queue,
            num_bev_queue * num_heads * num_levels * num_points * 2)
        self.attention_weights = nn.Linear(
            embed_dims * self.num_bev_queue,
            num_bev_queue * num_heads * num_levels * num_points)
        self.value_proj = nn.Linear(embed_dims, embed_dims)
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.init_weights()

    def init_weights(self):
        """Default initialization for Parameters of Module."""
        constant_init(self.sampling_offsets, 0.)
        # Bias-initialize offsets to a ring of num_heads directions, scaled
        # by point index (standard Deformable-DETR init).
        thetas = torch.arange(
            self.num_heads,
            dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init /
                     grid_init.abs().max(-1, keepdim=True)[0]).view(
                         self.num_heads, 1, 1,
                         2).repeat(1, self.num_levels * self.num_bev_queue,
                                   self.num_points, 1)

        for i in range(self.num_points):
            grid_init[:, :, i, :] *= i + 1

        self.sampling_offsets.bias.data = grid_init.view(-1)
        constant_init(self.attention_weights, val=0., bias=0.)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True

    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                level_start_index=None,
                flag='decoder',
                **kwargs):
        """Forward Function of MultiScaleDeformAttention.

        Args:
            query (Tensor): Query of Transformer with shape
                (num_query, bs, embed_dims).
            key (Tensor): The key tensor with shape
                `(num_key, bs, embed_dims)`.
            value (Tensor): The value tensor with shape
                `(num_key, bs, embed_dims)`.
            identity (Tensor): The tensor used for addition, with the
                same shape as `query`. Default None. If None,
                `query` will be used.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`. Default
                None.
            reference_points (Tensor): The normalized reference
                points with shape (bs, num_query, num_levels, 2),
                all elements is range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area.
                or (N, Length_{query}, num_levels, 4), add
                additional two dimensions is (w, h) to
                form reference boxes.
            key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_key].
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2),
                last dimension represents (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape ``(num_levels, )`` and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
            Tensor: forwarded results with shape [num_query, bs, embed_dims].
        """
        if value is None:
            assert self.batch_first
            bs, len_bev, c = query.shape
            # No history BEV supplied: duplicate the current queries so the
            # queue holds [prev, cur] per batch element, batch-major
            # interleaved: (bs, 2, len, c) -> (bs*2, len, c).
            value = torch.stack([query, query], 1).reshape(bs * 2, len_bev, c)

        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos
        if not self.batch_first:
            # change to (bs, num_query ,embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)
        bs, num_query, embed_dims = query.shape
        _, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
        assert self.num_bev_queue == 2

        # Concatenate each batch element's history BEV (queue slot 0, i.e.
        # value[::2]) onto its current query along the channel axis.
        query = torch.cat([value[::2], query], -1)

        # BUGFIX: the original applied `value = self.value_proj(value)`
        # twice in a row, projecting the values through the same Linear
        # layer two times; it also built an unused (and queue-major, hence
        # inconsistent with the batch-major layout used below) `value_`
        # tensor. Project once and keep the batch-major interleaved layout.
        value = self.value_proj(value)

        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)
        value = value.reshape(bs * self.num_bev_queue,
                              num_value, self.num_heads, -1)

        sampling_offsets = self.sampling_offsets(query)
        sampling_offsets = sampling_offsets.view(
            bs, num_query, self.num_heads, self.num_bev_queue,
            self.num_levels, self.num_points, 2)
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_bev_queue,
            self.num_levels * self.num_points)
        attention_weights = attention_weights.softmax(-1)

        attention_weights = attention_weights.view(
            bs, num_query, self.num_heads, self.num_bev_queue,
            self.num_levels, self.num_points)

        # Move the queue axis next to the batch axis so reshape produces the
        # same batch-major (bs*num_bev_queue, ...) layout as `value`.
        attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5) \
            .reshape(bs * self.num_bev_queue, num_query, self.num_heads,
                     self.num_levels, self.num_points).contiguous()
        sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6) \
            .reshape(bs * self.num_bev_queue, num_query, self.num_heads,
                     self.num_levels, self.num_points, 2)

        if reference_points.shape[-1] == 2:
            offset_normalizer = torch.stack(
                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                + sampling_offsets \
                / offset_normalizer[None, None, None, :, None, :]

        elif reference_points.shape[-1] == 4:
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                + sampling_offsets / self.num_points \
                * reference_points[:, :, None, :, None, 2:] \
                * 0.5
        else:
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
        if torch.cuda.is_available() and value.is_cuda:
            # using fp16 deformable attention is unstable because it performs many sum operations
            if value.dtype == torch.float16:
                MultiScaleDeformableAttnFunction = \
                    MultiScaleDeformableAttnFunction_fp32
            else:
                MultiScaleDeformableAttnFunction = \
                    MultiScaleDeformableAttnFunction_fp32
            output = MultiScaleDeformableAttnFunction.apply(
                value, spatial_shapes, level_start_index, sampling_locations,
                attention_weights, self.im2col_step)
        else:
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights)

        # output shape (bs*num_bev_queue, num_query, embed_dims)
        # (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue)
        output = output.permute(1, 2, 0)

        # fuse history value and current value
        # (num_query, embed_dims, bs*num_bev_queue)-> (num_query, embed_dims, bs, num_bev_queue)
        output = output.view(num_query, embed_dims, bs, self.num_bev_queue)
        output = output.mean(-1)

        # (num_query, embed_dims, bs)-> (bs, num_query, embed_dims)
        output = output.permute(2, 0, 1)

        output = self.output_proj(output)

        if not self.batch_first:
            output = output.permute(1, 0, 2)

        return self.dropout(output) + identity
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/transformer.py
0 → 100644
View file @
cce49ba9
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import
numpy
as
np
import
torch
import
torch.nn
as
nn
from
mmcv.cnn
import
xavier_init
from
mmcv.cnn.bricks.transformer
import
build_transformer_layer_sequence
from
mmcv.runner.base_module
import
BaseModule
from
mmdet.models.utils.builder
import
TRANSFORMER
from
torch.nn.init
import
normal_
from
mmcv.runner.base_module
import
BaseModule
from
torchvision.transforms.functional
import
rotate
from
.temporal_self_attention
import
TemporalSelfAttention
from
.spatial_cross_attention
import
MSDeformableAttention3D
from
.decoder
import
CustomMSDeformableAttention
from
mmcv.runner
import
force_fp32
,
auto_fp16
import
pdb
@TRANSFORMER.register_module()
class PerceptionTransformer(BaseModule):
    """Decoder-only perception transformer for lane/centerline queries.

    Object queries attend to a pre-built BEV feature map through ``decoder``
    (built via ``build_transformer_layer_sequence``); no encoder is owned by
    this module.

    Args:
        decoder (dict): Config of the transformer decoder sequence.
        embed_dims (int): Embedding dimension of queries and BEV features.
            Default: 256.
    """

    def __init__(self,
                 decoder=None,
                 embed_dims=256,
                 **kwargs):
        super(PerceptionTransformer, self).__init__(**kwargs)
        self.decoder = build_transformer_layer_sequence(decoder)
        self.embed_dims = embed_dims
        self.fp16_enabled = False
        self.init_layers()

    def init_layers(self):
        """Create the learnable layers: a linear head mapping each query's
        positional embedding to an initial 3D reference point."""
        self.reference_points = nn.Linear(self.embed_dims, 3)

    def init_weights(self):
        """Initialize the transformer weights."""
        # Generic Xavier init for every weight matrix (dim > 1).
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # The custom attention modules carry their own re-initialization;
        # some versions expose it as `init_weight`, others as `init_weights`,
        # hence the AttributeError fallback.
        for m in self.modules():
            if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \
                    or isinstance(m, CustomMSDeformableAttention):
                try:
                    m.init_weight()
                except AttributeError:
                    m.init_weights()
        xavier_init(self.reference_points, distribution='uniform', bias=0.)

    # NOTE(review): 'bev_queries', 'prev_bev' and 'bev_pos' in apply_to are not
    # parameters of this forward — they look like leftovers from the BEVFormer
    # original. Harmless (auto_fp16 matches by name), but confirm.
    @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos'))
    def forward(self,
                mlvl_feats,
                bev_embed,
                object_query_embed,
                bev_h,
                bev_w,
                reg_branches=None,
                cls_branches=None,
                **kwargs):
        """Decode object queries against a BEV feature map.

        Args:
            mlvl_feats (list[Tensor]): Multi-level image features, each of
                shape [bs, num_cams, embed_dims, h, w]. Only used here to
                read the batch size.
            bev_embed (Tensor): BEV features; presumably (bs, bev_h*bev_w, c)
                given the permute below — TODO confirm against the caller.
            object_query_embed (Tensor): Query embeddings [num_query, 2*c];
                split below into positional and content halves.
            bev_h (int): BEV grid height.
            bev_w (int): BEV grid width.
            reg_branches (nn.ModuleList, optional): Per-layer regression heads
                forwarded to the decoder for iterative box refinement.
            cls_branches (nn.ModuleList, optional): Per-layer classification
                heads forwarded to the decoder.

        Returns:
            tuple:
                - inter_states: decoder outputs (per layer if the decoder
                  returns intermediates).
                - init_reference_out: initial reference points,
                  (bs, num_query, 3), in [0, 1] after sigmoid.
                - inter_references_out: refined reference points per layer.
        """
        bs = mlvl_feats[0].size(0)
        # First half of each query embedding is positional, second is content.
        query_pos, query = torch.split(object_query_embed, self.embed_dims, dim=1)
        query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1)
        query = query.unsqueeze(0).expand(bs, -1, -1)
        # Initial normalized 3D reference points predicted from query_pos.
        reference_points = self.reference_points(query_pos)
        reference_points = reference_points.sigmoid()
        init_reference_out = reference_points

        # Switch to (len, batch, channels) layout expected by the decoder.
        query = query.permute(1, 0, 2)
        query_pos = query_pos.permute(1, 0, 2)
        bev_embed = bev_embed.permute(1, 0, 2)

        inter_states, inter_references = self.decoder(
            query=query,
            key=None,
            value=bev_embed,
            query_pos=query_pos,
            reference_points=reference_points,
            reg_branches=reg_branches,
            cls_branches=cls_branches,
            # The BEV map is treated as a single-level feature for
            # deformable attention.
            spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device),
            level_start_index=torch.tensor([0], device=query.device),
            **kwargs)

        inter_references_out = inter_references
        return inter_states, init_reference_out, inter_references_out
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/necks/__init__.py
0 → 100644
View file @
cce49ba9
# Importing this package registers the custom neck modules with mmdet3d's
# NECKS registry (triggered from configs via `custom_imports`); the star
# imports exist for that side effect.
from .custom_fpn import *
from .custom_ipm_view_transformer import *
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/necks/custom_fpn.py
0 → 100644
View file @
cce49ba9
# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# custom_fpn.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
# Contact wanghuijie@pjlab.org.cn if you have any issue.
#
# Copyright (c) 2023 The OpenLane-v2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmcv.cnn
import
ConvModule
from
mmcv.runner
import
BaseModule
from
mmdet3d.models
import
NECKS
@NECKS.register_module()
class CustomFPN(BaseModule):
    r"""
    Notes
    -----
    Adapted from https://github.com/HuangJunJie2017/BEVDet/blob/dev2.0/mmdet3d/models/necks/fpn.py#L11.
    Feature Pyramid Network.
    This is an implementation of paper `Feature Pyramid Networks for Object
    Detection <https://arxiv.org/abs/1612.03144>`_.
    Unlike the stock FPN, only the levels listed in ``out_ids`` get a 3x3
    output conv, and ``forward`` returns a single tensor (``outs[0]``)
    instead of a tuple of all levels.
    Args:
        in_channels (List[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale)
        num_outs (int): Number of output scales.
        start_level (int): Index of the start input backbone level used to
            build the feature pyramid. Default: 0.
        end_level (int): Index of the end input backbone level (exclusive) to
            build the feature pyramid. Default: -1, which means the last level.
        out_ids (List[int]): Indices of the pyramid levels that receive an
            output conv. Default: [].
        add_extra_convs (bool | str): If bool, it decides whether to add conv
            layers on top of the original feature maps. Default to False.
            If True, it is equivalent to `add_extra_convs='on_input'`.
            If str, it specifies the source feature map of the extra convs.
            Only the following options are allowed
            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
            - 'on_lateral': Last feature map after lateral convs.
            - 'on_output': The last output feature map after fpn convs.
        relu_before_extra_convs (bool): Whether to apply relu before the extra
            conv. Default: False.
        no_norm_on_lateral (bool): Whether to apply norm on lateral.
            Default: False.
        conv_cfg (dict): Config dict for convolution layer. Default: None.
        norm_cfg (dict): Config dict for normalization layer. Default: None.
        act_cfg (str): Config dict for activation layer in ConvModule.
            Default: None.
        upsample_cfg (dict): Config dict for interpolate layer.
            Default: `dict(mode='nearest')`
        init_cfg (dict or list[dict], optional): Initialization config dict.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_outs,
                 start_level=0,
                 end_level=-1,
                 out_ids=[],
                 add_extra_convs=False,
                 relu_before_extra_convs=False,
                 no_norm_on_lateral=False,
                 conv_cfg=None,
                 norm_cfg=None,
                 act_cfg=None,
                 upsample_cfg=dict(mode='nearest'),
                 init_cfg=dict(
                     type='Xavier', layer='Conv2d', distribution='uniform')):
        super(CustomFPN, self).__init__(init_cfg)
        assert isinstance(in_channels, list)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_ins = len(in_channels)
        self.num_outs = num_outs
        self.relu_before_extra_convs = relu_before_extra_convs
        self.no_norm_on_lateral = no_norm_on_lateral
        self.fp16_enabled = False
        # Copy so later mutation of the caller's dict cannot leak in.
        self.upsample_cfg = upsample_cfg.copy()
        self.out_ids = out_ids
        if end_level == -1:
            self.backbone_end_level = self.num_ins
            # assert num_outs >= self.num_ins - start_level
        else:
            # if end_level < inputs, no extra level is allowed
            self.backbone_end_level = end_level
            assert end_level <= len(in_channels)
            assert num_outs == end_level - start_level
        self.start_level = start_level
        self.end_level = end_level
        self.add_extra_convs = add_extra_convs
        assert isinstance(add_extra_convs, (str, bool))
        if isinstance(add_extra_convs, str):
            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
        elif add_extra_convs:  # True
            self.add_extra_convs = 'on_input'

        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()

        for i in range(self.start_level, self.backbone_end_level):
            # 1x1 lateral conv that projects each backbone level to out_channels.
            l_conv = ConvModule(
                in_channels[i],
                out_channels,
                1,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
                act_cfg=act_cfg,
                inplace=False)
            self.lateral_convs.append(l_conv)
            # Only levels named in out_ids get a 3x3 smoothing/output conv.
            if i in self.out_ids:
                fpn_conv = ConvModule(
                    out_channels,
                    out_channels,
                    3,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg,
                    inplace=False)
                self.fpn_convs.append(fpn_conv)

        # add extra conv layers (e.g., RetinaNet)
        extra_levels = num_outs - self.backbone_end_level + self.start_level
        if self.add_extra_convs and extra_levels >= 1:
            for i in range(extra_levels):
                if i == 0 and self.add_extra_convs == 'on_input':
                    in_channels = self.in_channels[self.backbone_end_level - 1]
                else:
                    in_channels = out_channels
                extra_fpn_conv = ConvModule(
                    in_channels,
                    out_channels,
                    3,
                    stride=2,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg,
                    inplace=False)
                self.fpn_convs.append(extra_fpn_conv)

    def forward(self, inputs):
        """Forward function.

        Args:
            inputs (list[Tensor]): One feature map per backbone level.

        Returns:
            Tensor: The first selected output level (``outs[0]``) — this
            variant produces a single feature map, not a tuple.
        """
        assert len(inputs) == len(self.in_channels)

        # build laterals
        laterals = [
            lateral_conv(inputs[i + self.start_level])
            for i, lateral_conv in enumerate(self.lateral_convs)
        ]

        # build top-down path: upsample coarser level and add into finer.
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
            # it cannot co-exist with `size` in `F.interpolate`.
            if 'scale_factor' in self.upsample_cfg:
                laterals[i - 1] += F.interpolate(laterals[i],
                                                **self.upsample_cfg)
            else:
                prev_shape = laterals[i - 1].shape[2:]
                laterals[i - 1] += F.interpolate(
                    laterals[i], size=prev_shape, **self.upsample_cfg)

        # build outputs
        # part 1: from original levels (only out_ids levels have an fpn conv).
        # NOTE(review): indexing fpn_convs[i] by pyramid index assumes
        # out_ids is a prefix like [0] or matches positions in fpn_convs —
        # with sparse out_ids the indices would disagree; confirm configs
        # only use out_ids=[0].
        outs = [self.fpn_convs[i](laterals[i]) for i in self.out_ids]
        # part 2: add extra levels
        if self.num_outs > len(outs):
            # use max pool to get more levels on top of outputs
            # (e.g., Faster R-CNN, Mask R-CNN)
            if not self.add_extra_convs:
                for i in range(self.num_outs - used_backbone_levels):
                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
            # add conv layers on top of original feature maps (RetinaNet)
            else:
                if self.add_extra_convs == 'on_input':
                    extra_source = inputs[self.backbone_end_level - 1]
                elif self.add_extra_convs == 'on_lateral':
                    extra_source = laterals[-1]
                elif self.add_extra_convs == 'on_output':
                    extra_source = outs[-1]
                else:
                    raise NotImplementedError
                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
                for i in range(used_backbone_levels + 1, self.num_outs):
                    if self.relu_before_extra_convs:
                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
                    else:
                        outs.append(self.fpn_convs[i](outs[-1]))
        return outs[0]
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/necks/custom_ipm_view_transformer.py
0 → 100644
View file @
cce49ba9
# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# custom_ipm_view_transformer.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
# Contact wanghuijie@pjlab.org.cn if you have any issue.
#
# Copyright (c) 2023 The OpenLane-v2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import
copy
import
math
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmcv.runner
import
BaseModule
from
mmdet3d.models
import
NECKS
def get_campos(reference_points, ego2cam, img_shape):
    '''
    Project ego-frame reference points into every camera's normalized
    image plane and mark which projections are valid.

    Args:
        reference_points: [B, num_query, 3]
        ego2cam: (B, num_cam, 4, 4) projection matrices (array-like)
        img_shape: (H, W) of the input images
    Returns:
        reference_points_cam: (B*num_cam, num_query, 2), in [-1, 1]
            (grid_sample convention)
        mask: (B, num_cam, num_query) — True where the point is in front
            of the camera and inside the image
    '''
    eps = 1e-9
    proj_mats = reference_points.new_tensor(ego2cam)  # (B, num_cam, 4, 4)
    pts = reference_points.clone()

    B, num_query = pts.shape[:2]
    num_cam = proj_mats.shape[1]

    # Lift to homogeneous coordinates: (B, num_query, 4).
    pts = torch.cat((pts, torch.ones_like(pts[..., :1])), -1)

    # Broadcast every point to every camera and apply the projection.
    pts = pts.view(B, 1, num_query, 4).repeat(1, num_cam, 1, 1).unsqueeze(-1)
    proj_mats = proj_mats.view(B, num_cam, 1, 4, 4).repeat(1, 1, num_query, 1, 1)
    cam_pts = torch.matmul(proj_mats, pts).squeeze(-1)  # (B, num_cam, num_query, 4)

    # Points with non-positive depth are behind the camera.
    in_front = cam_pts[..., 2:3] > eps

    # Perspective divide, then scale pixel coords to [0, 1] by image size.
    uv = cam_pts[..., 0:2] / cam_pts[..., 2:3] + eps
    uv[..., 0] /= img_shape[1]
    uv[..., 1] /= img_shape[0]

    # Map [0, 1] -> [-1, 1] for grid_sample.
    uv = (uv - 0.5) * 2

    valid = (in_front
             & (uv[..., 0:1] > -1.0)
             & (uv[..., 0:1] < 1.0)
             & (uv[..., 1:2] > -1.0)
             & (uv[..., 1:2] < 1.0))

    valid = valid.view(B, num_cam, num_query)
    uv = uv.view(B * num_cam, num_query, 2)
    return uv, valid
def construct_plane_grid(xbound, ybound, height: float, dtype=torch.float32):
    '''
    Build a dense grid of 3D points lying on the horizontal plane z == height.

    Args:
        xbound: (xmin, xmax, xstep)
        ybound: (ymin, ymax, ystep)
        height: z-coordinate shared by every grid point
        dtype: dtype of the returned tensor
    Returns:
        plane: (H, W, 3) tensor of (x, y, z) coordinates,
               H = (ymax - ymin) / ystep, W = (xmax - xmin) / xstep
    '''
    x_lo, x_hi, x_step = xbound[0], xbound[1], xbound[2]
    y_lo, y_hi, y_step = ybound[0], ybound[1], ybound[2]
    n_cols = int((x_hi - x_lo) / x_step)
    n_rows = int((y_hi - y_lo) / y_step)

    xs = torch.linspace(x_lo, x_hi, n_cols, dtype=dtype)
    ys = torch.linspace(y_lo, y_hi, n_rows, dtype=dtype)

    # Row index varies y, column index varies x ('ij' default of meshgrid).
    grid_y, grid_x = torch.meshgrid(ys, xs)
    grid_z = torch.full_like(grid_x, height)

    # (n_rows, n_cols, 3)
    return torch.stack([grid_x, grid_y, grid_z], dim=-1)
@NECKS.register_module()
class CustomIPMViewTransformer(BaseModule):
    r"""Inverse-perspective-mapping (IPM) view transformer.

    Projects multi-camera image features onto a stack of horizontal BEV
    planes (one per sampling height) and fuses them into a single BEV
    feature map.

    Notes
    -----
    Adapted from https://github.com/Mrmoore98/VectorMapNet_code/blob/mian/plugin/models/backbones/ipm_backbone.py#L238.
    """

    def __init__(self,
                 num_cam,
                 xbound,
                 ybound,
                 zbound,
                 out_channels,
                 ):
        super().__init__()
        self.x_bound = xbound
        self.y_bound = ybound
        # One sampling plane per height step from zbound[0] to zbound[1]
        # (inclusive, hence the +1).
        heights = [zbound[0] + i * zbound[2]
                   for i in range(int((zbound[1] - zbound[0]) // zbound[2]) + 1)]
        self.heights = heights
        self.num_cam = num_cam
        # Each plane contributes out_channels feature maps plus 3 coordinate
        # channels; all planes are concatenated and reduced to out_channels.
        self.outconvs = \
            nn.Conv2d((out_channels + 3) * len(heights), out_channels,
                      kernel_size=3, stride=1, padding=1)  # 'same' spatial size
        # Precomputed BEV sampling planes, stacked to shape (nlvl, bH, bW, 3).
        bev_planes = [construct_plane_grid(xbound, ybound, h) for h in self.heights]
        self.register_buffer('bev_planes', torch.stack(bev_planes),)

    def forward(self, cam_feat, ego2cam, img_shape):
        '''
        Inverse-project camera features onto the stacked BEV planes.

        Args:
            cam_feat: (B*num_cam, C, cH, cW) per-camera feature maps
            ego2cam: (B, num_cam, 4, 4) ego-to-image projection matrices
            img_shape: tuple (H, W) of the input images
        Returns:
            bev_feat: (B, out_channels, bH, bW) fused BEV features
        '''
        B = ego2cam.shape[0]
        C = cam_feat.shape[1]
        bev_grid = self.bev_planes.unsqueeze(0).repeat(B, 1, 1, 1, 1)
        nlvl, bH, bW = bev_grid.shape[1:4]
        bev_grid = bev_grid.flatten(1, 3)  # B, nlvl*bH*bW, 3

        # Pixel positions (grid_sample coords) and per-camera visibility of
        # every BEV point.
        bev_grid_pos, bev_cam_mask = get_campos(bev_grid, ego2cam, img_shape)

        # B*num_cam, nlvl*bH, bW, 2
        bev_grid_pos = bev_grid_pos.unflatten(-2, (nlvl * bH, bW))

        # Sample image features at the projected positions.
        projected_feature = F.grid_sample(
            cam_feat, bev_grid_pos).view(B, -1, C, nlvl, bH, bW)  # B, num_cam, C, nlvl, bH, bW
        # B, num_cam, nlvl, bH, bW
        bev_feat_mask = bev_cam_mask.unflatten(-1, (nlvl, bH, bW))

        # Fuse cameras: sum visible contributions per BEV cell, then divide
        # by the number of cameras that actually see the cell.
        bev_feat_mask = bev_feat_mask.unsqueeze(2)
        projected_feature = (projected_feature * bev_feat_mask).sum(1)
        num_feat = bev_feat_mask.sum(1)
        projected_feature = projected_feature / \
            num_feat.masked_fill(num_feat == 0, 1)  # avoid div-by-zero for unseen cells

        # Append the plane coordinates (x, y, z) as positional channels.
        # projected_feature: B, C+3, nlvl, bH, bW
        bev_grid = bev_grid.view(B, nlvl, bH, bW, 3).permute(0, 4, 1, 2, 3)
        projected_feature = torch.cat((projected_feature, bev_grid), dim=1)

        bev_feat, bev_feat_mask = projected_feature, bev_feat_mask.sum(1) > 0
        # Merge the (C+3, nlvl) dims and project back to out_channels.
        bev_feat = bev_feat.flatten(1, 2)
        bev_feat = self.outconvs(bev_feat)
        return bev_feat
autonomous_driving/openlane-v2/plugin/mmdet3d/configs/baseline.py
0 → 100644
View file @
cce49ba9
custom_imports
=
dict
(
imports
=
[
'projects.openlanev2.baseline'
])
method_para
=
dict
(
n_control
=
5
)
# #point for each curve
_dim_
=
128
model
=
dict
(
type
=
'Baseline'
,
img_backbone
=
dict
(
type
=
'ResNet'
,
depth
=
18
,
num_stages
=
4
,
out_indices
=
(
2
,
3
),
frozen_stages
=-
1
,
norm_cfg
=
dict
(
type
=
'BN'
,
requires_grad
=
True
),
norm_eval
=
False
,
with_cp
=
True
,
style
=
'pytorch'
,
init_cfg
=
dict
(
type
=
'Pretrained'
,
checkpoint
=
'torchvision://resnet18'
)),
img_neck
=
dict
(
type
=
'CustomFPN'
,
in_channels
=
[
_dim_
*
2
,
_dim_
*
4
],
out_channels
=
_dim_
,
num_outs
=
1
,
start_level
=
0
,
out_ids
=
[
0
]),
img_view_transformer
=
dict
(
type
=
'CustomIPMViewTransformer'
,
num_cam
=
7
,
xbound
=
[
-
50.0
,
50.0
,
1.0
],
ybound
=
[
-
25.0
,
25.0
,
1.0
],
zbound
=
[
-
3.0
,
2.0
,
0.5
],
out_channels
=
_dim_
),
lc_head
=
dict
(
type
=
'CustomDETRHead'
,
num_classes
=
1
,
in_channels
=
_dim_
,
num_query
=
50
,
object_type
=
'lane'
,
num_layers
=
1
,
num_reg_dim
=
method_para
[
'n_control'
]
*
3
,
loss_cls
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
1.0
),
loss_bbox
=
dict
(
type
=
'L1Loss'
,
loss_weight
=
2.5
),
loss_iou
=
dict
(
type
=
'GIoULoss'
,
loss_weight
=
0.0
),
# dummy
train_cfg
=
dict
(
assigner
=
dict
(
type
=
'LaneHungarianAssigner'
,
cls_cost
=
dict
(
type
=
'FocalLossCost'
,
weight
=
1.0
),
reg_cost
=
dict
(
type
=
'LaneL1Cost'
,
weight
=
2.5
),
iou_cost
=
dict
(
type
=
'IoUCost'
,
weight
=
0.0
))),
# dummy
bev_range
=
[
-
50.0
,
-
25.0
,
-
3.0
,
50.0
,
25.0
,
2.0
]),
te_head
=
dict
(
type
=
'CustomDETRHead'
,
num_classes
=
13
,
in_channels
=
_dim_
,
num_query
=
30
,
object_type
=
'bbox'
,
num_layers
=
1
,
loss_cls
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
1.0
),
loss_bbox
=
dict
(
type
=
'L1Loss'
,
loss_weight
=
2.5
),
loss_iou
=
dict
(
type
=
'GIoULoss'
,
loss_weight
=
1.0
),
train_cfg
=
dict
(
assigner
=
dict
(
type
=
'HungarianAssigner'
,
cls_cost
=
dict
(
type
=
'FocalLossCost'
,
weight
=
1.0
),
reg_cost
=
dict
(
type
=
'BBoxL1Cost'
,
weight
=
2.5
,
box_format
=
'xywh'
),
iou_cost
=
dict
(
type
=
'IoUCost'
,
iou_mode
=
'giou'
,
weight
=
1.0
)))),
lclc_head
=
dict
(
type
=
'TopologyHead'
,
in_channels
=
128
,
hidden_channels
=
_dim_
,
out_channels
=
1
,
num_layers
=
3
,
loss_cls
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
2.0
)),
lcte_head
=
dict
(
type
=
'TopologyHead'
,
in_channels
=
128
,
hidden_channels
=
_dim_
,
out_channels
=
1
,
num_layers
=
3
,
loss_cls
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
2.0
)))
img_norm_cfg
=
dict
(
mean
=
[
103.530
,
116.280
,
123.675
],
std
=
[
1.0
,
1.0
,
1.0
],
to_rgb
=
False
)
train_pipeline
=
[
dict
(
type
=
'CustomLoadMultiViewImageFromFiles'
,
to_float32
=
True
),
dict
(
type
=
'NormalizeMultiviewImage'
,
**
img_norm_cfg
),
dict
(
type
=
'PhotoMetricDistortionMultiViewImage'
),
dict
(
type
=
'ResizeFrontView'
),
dict
(
type
=
'CustomPadMultiViewImage'
,
size_divisor
=
32
),
dict
(
type
=
'CustomParameterizeLane'
,
method
=
'bezier_Endpointfixed'
,
method_para
=
method_para
),
dict
(
type
=
'CustomDefaultFormatBundle'
),
dict
(
type
=
'Collect'
,
keys
=
[
'img'
,
'gt_lc'
,
'gt_lc_labels'
,
'gt_te'
,
'gt_te_labels'
,
'gt_topology_lclc'
,
'gt_topology_lcte'
,
],
meta_keys
=
[
'scene_token'
,
'sample_idx'
,
'img_paths'
,
'img_shape'
,
'scale_factor'
,
'pad_shape'
,
'lidar2img'
,
'can_bus'
,
],
)
]
test_pipeline
=
[
dict
(
type
=
'CustomLoadMultiViewImageFromFiles'
,
to_float32
=
True
),
dict
(
type
=
'NormalizeMultiviewImage'
,
**
img_norm_cfg
),
dict
(
type
=
'ResizeFrontView'
),
dict
(
type
=
'CustomPadMultiViewImage'
,
size_divisor
=
32
),
dict
(
type
=
'CustomDefaultFormatBundle'
),
dict
(
type
=
'Collect'
,
keys
=
[
'img'
,
],
meta_keys
=
[
'scene_token'
,
'sample_idx'
,
'img_paths'
,
'img_shape'
,
'scale_factor'
,
'pad_shape'
,
'lidar2img'
,
'can_bus'
,
],
)
]
dataset_type
=
'OpenLaneV2SubsetADataset'
data_root
=
'OpenLane-V2/data/OpenLane-V2'
meta_root
=
'OpenLane-V2/data/OpenLane-V2'
data
=
dict
(
samples_per_gpu
=
2
,
workers_per_gpu
=
4
,
train
=
dict
(
type
=
dataset_type
,
data_root
=
data_root
,
meta_root
=
meta_root
,
collection
=
'data_dict_subset_A_train'
,
pipeline
=
train_pipeline
,
test_mode
=
False
),
val
=
dict
(
type
=
dataset_type
,
data_root
=
data_root
,
meta_root
=
meta_root
,
collection
=
'data_dict_subset_A_val'
,
pipeline
=
test_pipeline
,
test_mode
=
True
),
test
=
dict
(
type
=
dataset_type
,
data_root
=
data_root
,
meta_root
=
meta_root
,
collection
=
'data_dict_subset_A_val'
,
pipeline
=
test_pipeline
,
test_mode
=
True
),
shuffler_sampler
=
dict
(
type
=
'DistributedGroupSampler'
),
nonshuffler_sampler
=
dict
(
type
=
'DistributedSampler'
))
optimizer
=
dict
(
type
=
'AdamW'
,
lr
=
1e-4
,
weight_decay
=
1e-4
)
optimizer_config
=
dict
(
grad_clip
=
dict
(
max_norm
=
35
,
norm_type
=
2
))
lr_config
=
dict
(
policy
=
'CosineAnnealing'
,
warmup
=
'linear'
,
warmup_iters
=
500
,
warmup_ratio
=
1.0
/
3
,
min_lr_ratio
=
1e-3
)
runner
=
dict
(
type
=
'EpochBasedRunner'
,
max_epochs
=
20
)
evaluation
=
dict
(
interval
=
1
,
pipeline
=
test_pipeline
)
checkpoint_config
=
dict
(
interval
=
1
,
max_keep_ckpts
=
1
)
# yapf:disable
log_config
=
dict
(
interval
=
10
,
hooks
=
[
dict
(
type
=
'TextLoggerHook'
),
dict
(
type
=
'TensorboardLoggerHook'
)
])
# yapf:enable
dist_params
=
dict
(
backend
=
'nccl'
)
log_level
=
'INFO'
load_from
=
None
resume_from
=
None
workflow
=
[(
'train'
,
1
)]
autonomous_driving/openlane-v2/plugin/mmdet3d/configs/baseline_large.py
0 → 100644
View file @
cce49ba9
custom_imports
=
dict
(
imports
=
[
'plugin.mmdet3d.baseline'
])
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range
=
[
-
51.2
,
-
25.6
,
-
2.3
,
51.2
,
25.6
,
1.7
]
voxel_size
=
[
0.2
,
0.2
,
8
]
img_norm_cfg
=
dict
(
mean
=
[
123.675
,
116.28
,
103.53
],
std
=
[
58.395
,
57.12
,
57.375
],
to_rgb
=
True
)
class_names
=
[
'centerline'
]
input_modality
=
dict
(
use_lidar
=
False
,
use_camera
=
True
,
use_radar
=
False
,
use_map
=
False
,
use_external
=
False
)
num_cams
=
7
Map_size
=
[(
-
50
,
50
),
(
-
25
,
25
)]
method_para
=
dict
(
n_control
=
5
)
# #point for each curve
code_size
=
3
*
method_para
[
'n_control'
]
_dim_
=
256
_pos_dim_
=
_dim_
//
2
_ffn_dim_
=
_dim_
*
2
_ffn_cfg_
=
dict
(
type
=
'FFN'
,
embed_dims
=
_dim_
,
feedforward_channels
=
_ffn_dim_
,
num_fcs
=
2
,
ffn_drop
=
0.1
,
act_cfg
=
dict
(
type
=
'ReLU'
,
inplace
=
True
),
),
_num_levels_
=
4
bev_h_
=
100
bev_w_
=
200
# Full model: BEVFormer-style BEV construction + a 2D traffic-element head
# (bbox_head), a BEV lane-centerline head (pts_bbox_head), and two
# relationship (topology) heads.
model = dict(
    type='ROAD_BEVFormer',
    video_test_mode=False,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True),
    # Builds the BEV feature map from multi-camera features (BEVFormer encoder).
    bev_constructor=dict(
        type='BEVFormerConstructer',
        num_feature_levels=_num_levels_,
        num_cams=num_cams,
        embed_dims=_dim_,
        rotate_prev_bev=True,
        use_shift=True,
        use_can_bus=True,
        pc_range=point_cloud_range,
        bev_h=bev_h_,
        bev_w=bev_w_,
        rotate_center=[bev_h_ // 2, bev_w_ // 2],
        encoder=dict(
            type='BEVFormerEncoder',
            num_layers=3,
            pc_range=point_cloud_range,
            num_points_in_pillar=4,
            return_intermediate=False,
            transformerlayers=dict(
                type='BEVFormerLayer',
                attn_cfgs=[
                    dict(
                        type='TemporalSelfAttention',
                        embed_dims=_dim_,
                        num_levels=1),
                    dict(
                        type='SpatialCrossAttention',
                        embed_dims=_dim_,
                        num_cams=num_cams,
                        pc_range=point_cloud_range,
                        deformable_attention=dict(
                            type='MSDeformableAttention3D',
                            embed_dims=_dim_,
                            num_points=8,
                            num_levels=_num_levels_))
                ],
                ffn_cfgs=_ffn_cfg_,
                operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                 'ffn', 'norm'))),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
    ),
    # Traffic-element detection on the (front) image features.
    bbox_head=dict(
        type='TEDeformableDETRHead',
        num_query=100,
        num_classes=13,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='DeformableDetrTransformer',
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=_dim_),
                    ffn_cfgs=_ffn_cfg_,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DeformableDetrTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='CustomDetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=_dim_)
                    ],
                    ffn_cfgs=_ffn_cfg_,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=_pos_dim_,
            normalize=True,
            offset=-0.5),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=2.5),
        loss_iou=dict(type='GIoULoss', loss_weight=1.0),
        test_cfg=dict(max_per_img=50)),
    # Lane-centerline detection on the BEV features.
    pts_bbox_head=dict(
        type='LCDeformableDETRHead',
        num_classes=1,
        in_channels=_dim_,
        num_query=100,
        bev_h=bev_h_,
        bev_w=bev_w_,
        sync_cls_avg_factor=False,
        with_box_refine=False,
        with_shared_param=False,
        code_size=code_size,
        code_weights=[1.0 for i in range(code_size)],
        pc_range=point_cloud_range,
        transformer=dict(
            type='PerceptionTransformer',
            embed_dims=_dim_,
            decoder=dict(
                type='LaneDetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='CustomDetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                    ],
                    ffn_cfgs=_ffn_cfg_,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.5),
        loss_bbox=dict(type='L1Loss', loss_weight=0.0075),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    # Topology heads: lane-lane and lane-traffic-element relationships.
    lclc_head=dict(
        type='RelationshipHead',
        in_channels_o1=_dim_,
        in_channels_o2=_dim_,
        shared_param=False,
        loss_rel=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=5)),
    lcte_head=dict(
        type='RelationshipHead',
        in_channels_o1=_dim_,
        in_channels_o2=_dim_,
        shared_param=False,
        loss_rel=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=5)),
    # model training and testing settings
    bbox_train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=1.0),
            reg_cost=dict(type='BBoxL1Cost', weight=2.5, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0))),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='LaneHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=1.5),
                reg_cost=dict(type='LaneL1Cost', weight=0.0075),
                iou_cost=dict(type='IoUCost', weight=0.0),
                # Fake cost. This is just to make it compatible with DETR head.
            ))))
# Training pipeline: photometric augmentation + front-view resize + pad, then
# lanes are parameterized as fixed-endpoint Bezier curves.
train_pipeline = [
    dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='ResizeFrontView'),
    dict(type='CustomPadMultiViewImage', size_divisor=32),
    dict(type='CustomParameterizeLane',
         method='bezier_Endpointfixed',
         method_para=method_para),
    dict(type='CustomDefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img',
            'gt_lc',
            'gt_lc_labels',
            'gt_te',
            'gt_te_labels',
            'gt_topology_lclc',
            'gt_topology_lcte',
        ],
        meta_keys=[
            'scene_token',
            'sample_idx',
            'img_paths',
            'img_shape',
            'scale_factor',
            'pad_shape',
            'lidar2img',
            'can_bus',
        ],
    )
]
# Test pipeline: no augmentation, images only (no GT keys).
test_pipeline = [
    dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='ResizeFrontView'),
    dict(type='CustomPadMultiViewImage', size_divisor=32),
    dict(type='CustomDefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img',
        ],
        meta_keys=[
            'scene_token',
            'sample_idx',
            'img_paths',
            'img_shape',
            'scale_factor',
            'pad_shape',
            'lidar2img',
            'can_bus',
        ],
    )
]

dataset_type = 'OpenLaneV2SubsetADataset'
data_root = 'data/OpenLane-V2'
meta_root = 'data/OpenLane-V2'

# val and test both evaluate on the subset-A validation collection.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=8,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        meta_root=meta_root,
        collection='data_dict_subset_A_train',
        pipeline=train_pipeline,
        test_mode=False),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        meta_root=meta_root,
        collection='data_dict_subset_A_val',
        pipeline=test_pipeline,
        test_mode=True),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        meta_root=meta_root,
        collection='data_dict_subset_A_val',
        pipeline=test_pipeline,
        test_mode=True),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
# AdamW with a 10x smaller learning rate on the (partially frozen) backbone.
optimizer = dict(
    type='AdamW',
    lr=2e-4,
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),
        }),
    weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)
total_epochs = 24
evaluation = dict(interval=1, pipeline=test_pipeline)
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook')
    ])
checkpoint_config = dict(interval=1, max_keep_ckpts=1)
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
\ No newline at end of file
autonomous_driving/openlane-v2/plugin/mmdet3d/configs/internimage-s.py
0 → 100644
View file @
cce49ba9
custom_imports
=
dict
(
imports
=
[
'plugin.mmdet3d.baseline'
])
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range
=
[
-
51.2
,
-
25.6
,
-
2.3
,
51.2
,
25.6
,
1.7
]
voxel_size
=
[
0.2
,
0.2
,
8
]
img_norm_cfg
=
dict
(
mean
=
[
123.675
,
116.28
,
103.53
],
std
=
[
58.395
,
57.12
,
57.375
],
to_rgb
=
True
)
class_names
=
[
'centerline'
]
input_modality
=
dict
(
use_lidar
=
False
,
use_camera
=
True
,
use_radar
=
False
,
use_map
=
False
,
use_external
=
False
)
num_cams
=
7
Map_size
=
[(
-
50
,
50
),
(
-
25
,
25
)]
method_para
=
dict
(
n_control
=
5
)
# #point for each curve
code_size
=
3
*
method_para
[
'n_control'
]
_dim_
=
256
_pos_dim_
=
_dim_
//
2
_ffn_dim_
=
_dim_
*
2
_ffn_cfg_
=
dict
(
type
=
'FFN'
,
embed_dims
=
_dim_
,
feedforward_channels
=
_ffn_dim_
,
num_fcs
=
2
,
ffn_drop
=
0.1
,
act_cfg
=
dict
(
type
=
'ReLU'
,
inplace
=
True
),
),
_num_levels_
=
4
bev_h_
=
100
bev_w_
=
200
pretrained
=
'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_s_1k_224.pth'
# Full model: InternImage-S backbone + FPN neck, a BEVFormer-style BEV
# constructor, a DETR head for traffic elements (bbox_head), a lane
# centerline head (pts_bbox_head), and two relationship heads for the
# lane-lane / lane-traffic-element topology.
model = dict(
    type='ROAD_BEVFormer',
    video_test_mode=False,
    img_backbone=dict(
        type='InternImage',
        core_op='DCNv3',
        channels=80,
        depths=[4, 4, 21, 4],
        groups=[5, 10, 20, 40],
        mlp_ratio=4.0,
        drop_path_rate=0.3,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=1.0,
        post_norm=True,
        with_cp=False,
        out_indices=(0, 1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    img_neck=dict(
        type='FPN',
        in_channels=[80, 160, 320, 640],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True),
    # Builds the BEV feature map from multi-view image features.
    bev_constructor=dict(
        type='BEVFormerConstructer',
        num_feature_levels=_num_levels_,
        num_cams=num_cams,
        embed_dims=_dim_,
        rotate_prev_bev=True,
        use_shift=True,
        use_can_bus=True,
        pc_range=point_cloud_range,
        bev_h=bev_h_,
        bev_w=bev_w_,
        rotate_center=[bev_h_ // 2, bev_w_ // 2],
        encoder=dict(
            type='BEVFormerEncoder',
            num_layers=3,
            pc_range=point_cloud_range,
            num_points_in_pillar=4,
            return_intermediate=False,
            transformerlayers=dict(
                type='BEVFormerLayer',
                attn_cfgs=[
                    dict(
                        type='TemporalSelfAttention',
                        embed_dims=_dim_,
                        num_levels=1),
                    dict(
                        type='SpatialCrossAttention',
                        embed_dims=_dim_,
                        num_cams=num_cams,
                        pc_range=point_cloud_range,
                        deformable_attention=dict(
                            type='MSDeformableAttention3D',
                            embed_dims=_dim_,
                            num_points=8,
                            num_levels=_num_levels_))
                ],
                ffn_cfgs=_ffn_cfg_,
                operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                 'ffn', 'norm'))),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
    ),
    # 2D traffic-element detection head (DETR-style, on image features).
    bbox_head=dict(
        type='TEDeformableDETRHead',
        num_query=100,
        num_classes=13,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='DeformableDetrTransformer',
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=_dim_),
                    ffn_cfgs=_ffn_cfg_,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DeformableDetrTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='CustomDetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=_dim_)
                    ],
                    ffn_cfgs=_ffn_cfg_,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=_pos_dim_,
            normalize=True,
            offset=-0.5),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=2.5),
        loss_iou=dict(type='GIoULoss', loss_weight=1.0),
        test_cfg=dict(max_per_img=50)),
    # Lane centerline detection head operating on the BEV features.
    pts_bbox_head=dict(
        type='LCDeformableDETRHead',
        num_classes=1,
        in_channels=_dim_,
        num_query=100,
        bev_h=bev_h_,
        bev_w=bev_w_,
        sync_cls_avg_factor=False,
        with_box_refine=False,
        with_shared_param=False,
        code_size=code_size,
        code_weights=[1.0] * code_size,
        pc_range=point_cloud_range,
        transformer=dict(
            type='PerceptionTransformer',
            embed_dims=_dim_,
            decoder=dict(
                type='LaneDetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='CustomDetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                    ],
                    ffn_cfgs=_ffn_cfg_,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.5),
        loss_bbox=dict(type='L1Loss', loss_weight=0.0075),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    # Lane-lane topology head.
    lclc_head=dict(
        type='RelationshipHead',
        in_channels_o1=_dim_,
        in_channels_o2=_dim_,
        shared_param=False,
        loss_rel=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=5)),
    # Lane-traffic-element topology head.
    lcte_head=dict(
        type='RelationshipHead',
        in_channels_o1=_dim_,
        in_channels_o2=_dim_,
        shared_param=False,
        loss_rel=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=5)),
    # model training and testing settings
    bbox_train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=1.0),
            reg_cost=dict(type='BBoxL1Cost', weight=2.5, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0))),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='LaneHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=1.5),
                reg_cost=dict(type='LaneL1Cost', weight=0.0075),
                # Fake cost. This is just to make it compatible with DETR head.
                iou_cost=dict(type='IoUCost', weight=0.0)))))
# Training pipeline: load multi-view images, normalize/augment, fit each GT
# lane with a fixed-endpoint Bezier curve, and collect tensors + metadata.
train_pipeline = [
    dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='ResizeFrontView'),
    dict(type='CustomPadMultiViewImage', size_divisor=32),
    dict(
        type='CustomParameterizeLane',
        method='bezier_Endpointfixed',
        method_para=method_para),
    dict(type='CustomDefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img',
            'gt_lc',
            'gt_lc_labels',
            'gt_te',
            'gt_te_labels',
            'gt_topology_lclc',
            'gt_topology_lcte',
        ],
        meta_keys=[
            'scene_token',
            'sample_idx',
            'img_paths',
            'img_shape',
            'scale_factor',
            'pad_shape',
            'lidar2img',
            'can_bus',
        ],
    )
]
# Test-time pipeline: same image loading/normalization as training, but
# without photometric augmentation or GT lane parameterization.
test_pipeline = [
    dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='ResizeFrontView'),
    dict(type='CustomPadMultiViewImage', size_divisor=32),
    dict(type='CustomDefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img',
        ],
        meta_keys=[
            'scene_token',
            'sample_idx',
            'img_paths',
            'img_shape',
            'scale_factor',
            'pad_shape',
            'lidar2img',
            'can_bus',
        ],
    )
]
dataset_type = 'OpenLaneV2SubsetADataset'
data_root = 'data/OpenLane-V2'
meta_root = 'data/OpenLane-V2'

# Dataloaders: subset-A train split for training; the val split doubles as
# the test split.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=8,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        meta_root=meta_root,
        collection='data_dict_subset_A_train',
        pipeline=train_pipeline,
        test_mode=False),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        meta_root=meta_root,
        collection='data_dict_subset_A_val',
        pipeline=test_pipeline,
        test_mode=True),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        meta_root=meta_root,
        collection='data_dict_subset_A_val',
        pipeline=test_pipeline,
        test_mode=True),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
# AdamW with a 10x-reduced learning rate on the pretrained image backbone.
optimizer = dict(
    type='AdamW',
    lr=2e-4,
    paramwise_cfg=dict(custom_keys={'img_backbone': dict(lr_mult=0.1)}),
    weight_decay=0.01)
# Clip gradients by global L2 norm to keep training stable.
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# Learning policy: cosine annealing with a short linear warmup.
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)
total_epochs = 24
# Run evaluation every epoch, using the test-time data pipeline.
evaluation = dict(interval=1, pipeline=test_pipeline)
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)

# Logging / checkpointing / runtime defaults.
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
# Keep only the most recent checkpoint to bound disk usage.
checkpoint_config = dict(interval=1, max_keep_ckpts=1)
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
\ No newline at end of file
autonomous_driving/openlane-v2/requirements.txt
0 → 100644
View file @
cce49ba9
tqdm
ninja
jupyter
openmim
matplotlib
numpy >=1.22.0, <1.24.0
scikit-learn
similaritymeasures
opencv-python
scipy ==1.8.0
ortools ==9.2.9972
iso3166
chardet
autonomous_driving/openlane-v2/setup.py
0 → 100644
View file @
cce49ba9
# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# setup.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
# Contact wanghuijie@pjlab.org.cn if you have any issue.
#
# Copyright (c) 2023 The OpenLane-v2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from setuptools import find_packages, setup

# Package metadata for the OpenLane-V2 devkit.
setup(
    name='openlanev2',
    version='0.1.0',
    author='The OpenLane-V2 Dataset Authors',
    author_email='wanghuijie@pjlab.org.cn',
    description='The official devkit of the OpenLane-V2 dataset.',
    url='https://github.com/OpenDriveLab/OpenLane-V2',
    packages=find_packages(),
    license='Apache License 2.0',
)
autonomous_driving/openlane-v2/tutorial.ipynb
0 → 100644
View file @
cce49ba9
This source diff could not be displayed because it is too large. You can
view the blob
instead.
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment