OpenDAS / dcnv3 / Commits

Commit 80e8c1d3, authored Apr 21, 2023 by Charlie W

    Add openlane v2

parent dbf29e61
Showing 13 changed files with 2936 additions and 0 deletions (+2936, -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/multi_scale_deformable_attn_function.py  +163 -0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/spatial_cross_attention.py  +398 -0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/temporal_self_attention.py  +275 -0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/transformer.py  +139 -0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/necks/__init__.py  +2 -0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/necks/custom_fpn.py  +224 -0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/necks/custom_ipm_view_transformer.py  +195 -0
autonomous_driving/openlane-v2/plugin/mmdet3d/configs/baseline.py  +210 -0
autonomous_driving/openlane-v2/plugin/mmdet3d/configs/baseline_large.py  +353 -0
autonomous_driving/openlane-v2/plugin/mmdet3d/configs/internimage-s.py  +360 -0
autonomous_driving/openlane-v2/requirements.txt  +13 -0
autonomous_driving/openlane-v2/setup.py  +35 -0
autonomous_driving/openlane-v2/tutorial.ipynb  +569 -0
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/multi_scale_deformable_attn_function.py  (new file, mode 100644)

# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import torch
from torch.cuda.amp import custom_bwd, custom_fwd
from torch.autograd.function import Function, once_differentiable
from mmcv.utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])


class MultiScaleDeformableAttnFunction_fp16(Function):

    @staticmethod
    @custom_fwd(cast_inputs=torch.float16)
    def forward(ctx, value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights, im2col_step):
        """GPU version of multi-scale deformable attention.

        Args:
            value (Tensor): The value has shape
                (bs, num_keys, num_heads, embed_dims//num_heads)
            value_spatial_shapes (Tensor): Spatial shape of
                each feature map, has shape (num_levels, 2),
                last dimension 2 represent (h, w)
            sampling_locations (Tensor): The location of sampling points,
                has shape
                (bs, num_queries, num_heads, num_levels, num_points, 2),
                the last dimension 2 represent (x, y).
            attention_weights (Tensor): The weight of sampling points used
                when calculating the attention, has shape
                (bs, num_queries, num_heads, num_levels, num_points).
            im2col_step (Tensor): The step used in image to column.

        Returns:
            Tensor: has shape (bs, num_queries, embed_dims)
        """
        ctx.im2col_step = im2col_step
        output = ext_module.ms_deform_attn_forward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            im2col_step=ctx.im2col_step)
        ctx.save_for_backward(value, value_spatial_shapes,
                              value_level_start_index, sampling_locations,
                              attention_weights)
        return output

    @staticmethod
    @once_differentiable
    @custom_bwd
    def backward(ctx, grad_output):
        """GPU version of backward function.

        Args:
            grad_output (Tensor): Gradient of output tensor of forward.

        Returns:
            Tuple[Tensor]: Gradient of input tensors in forward.
        """
        value, value_spatial_shapes, value_level_start_index, \
            sampling_locations, attention_weights = ctx.saved_tensors
        grad_value = torch.zeros_like(value)
        grad_sampling_loc = torch.zeros_like(sampling_locations)
        grad_attn_weight = torch.zeros_like(attention_weights)

        ext_module.ms_deform_attn_backward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            grad_output.contiguous(),
            grad_value,
            grad_sampling_loc,
            grad_attn_weight,
            im2col_step=ctx.im2col_step)

        return grad_value, None, None, \
            grad_sampling_loc, grad_attn_weight, None


class MultiScaleDeformableAttnFunction_fp32(Function):

    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights, im2col_step):
        """GPU version of multi-scale deformable attention.

        Args:
            value (Tensor): The value has shape
                (bs, num_keys, num_heads, embed_dims//num_heads)
            value_spatial_shapes (Tensor): Spatial shape of
                each feature map, has shape (num_levels, 2),
                last dimension 2 represent (h, w)
            sampling_locations (Tensor): The location of sampling points,
                has shape
                (bs, num_queries, num_heads, num_levels, num_points, 2),
                the last dimension 2 represent (x, y).
            attention_weights (Tensor): The weight of sampling points used
                when calculating the attention, has shape
                (bs, num_queries, num_heads, num_levels, num_points).
            im2col_step (Tensor): The step used in image to column.

        Returns:
            Tensor: has shape (bs, num_queries, embed_dims)
        """
        ctx.im2col_step = im2col_step
        output = ext_module.ms_deform_attn_forward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            im2col_step=ctx.im2col_step)
        ctx.save_for_backward(value, value_spatial_shapes,
                              value_level_start_index, sampling_locations,
                              attention_weights)
        return output

    @staticmethod
    @once_differentiable
    @custom_bwd
    def backward(ctx, grad_output):
        """GPU version of backward function.

        Args:
            grad_output (Tensor): Gradient of output tensor of forward.

        Returns:
            Tuple[Tensor]: Gradient of input tensors in forward.
        """
        value, value_spatial_shapes, value_level_start_index, \
            sampling_locations, attention_weights = ctx.saved_tensors
        grad_value = torch.zeros_like(value)
        grad_sampling_loc = torch.zeros_like(sampling_locations)
        grad_attn_weight = torch.zeros_like(attention_weights)

        ext_module.ms_deform_attn_backward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            grad_output.contiguous(),
            grad_value,
            grad_sampling_loc,
            grad_attn_weight,
            im2col_step=ctx.im2col_step)

        return grad_value, None, None, \
            grad_sampling_loc, grad_attn_weight, None
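The two autograd Functions above only change the AMP cast dtype; on GPU they wrap the same `ms_deform_attn` CUDA kernel. As a minimal sketch of the tensor contract they share, the snippet below runs the pure-PyTorch reference op that mmcv ships alongside the kernel (and that the attention modules later in this commit fall back to on CPU). All sizes are illustrative, not taken from this commit's configs.

# Shape check for multi-scale deformable attention on CPU, assuming an
# mmcv installation that provides the pure-PyTorch reference op.
import torch
from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch

bs, num_heads, head_dim = 2, 8, 32
spatial_shapes = torch.tensor([[16, 16], [8, 8]])  # (num_levels, 2) as (h, w)
num_value = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())
num_query, num_levels, num_points = 100, 2, 4

value = torch.rand(bs, num_value, num_heads, head_dim)
sampling_locations = torch.rand(
    bs, num_query, num_heads, num_levels, num_points, 2)
attention_weights = torch.rand(
    bs, num_query, num_heads, num_levels, num_points).softmax(-1)

out = multi_scale_deformable_attn_pytorch(
    value, spatial_shapes, sampling_locations, attention_weights)
print(out.shape)  # torch.Size([2, 100, 256]) = (bs, num_query, num_heads*head_dim)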
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/spatial_cross_attention.py  (new file, mode 100644)

# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init, constant_init
from mmcv.cnn.bricks.registry import (ATTENTION,
                                      TRANSFORMER_LAYER,
                                      TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.transformer import build_attention
import math
from mmcv.runner import force_fp32, auto_fp16
from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
from mmcv.utils import ext_loader
from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \
    MultiScaleDeformableAttnFunction_fp16

ext_module = ext_loader.load_ext(
    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])


@ATTENTION.register_module()
class SpatialCrossAttention(BaseModule):
    """An attention module used in BEVFormer.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_cams (int): The number of cameras.
        dropout (float): A Dropout layer on `inp_residual`.
            Default: 0.1.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        deformable_attention (dict): The config for the deformable attention
            used in SCA.
    """

    def __init__(self,
                 embed_dims=256,
                 num_cams=6,
                 pc_range=None,
                 dropout=0.1,
                 init_cfg=None,
                 batch_first=False,
                 deformable_attention=dict(
                     type='MSDeformableAttention3D',
                     embed_dims=256,
                     num_levels=4),
                 **kwargs):
        super(SpatialCrossAttention, self).__init__(init_cfg)

        self.init_cfg = init_cfg
        self.dropout = nn.Dropout(dropout)
        self.pc_range = pc_range
        self.fp16_enabled = False
        self.deformable_attention = build_attention(deformable_attention)
        self.embed_dims = embed_dims
        self.num_cams = num_cams
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.batch_first = batch_first
        self.init_weight()

    def init_weight(self):
        """Default initialization for Parameters of Module."""
        xavier_init(self.output_proj, distribution='uniform', bias=0.)

    @force_fp32(apply_to=('query', 'key', 'value', 'query_pos',
                          'reference_points_cam'))
    def forward(self,
                query,
                key,
                value,
                residual=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                reference_points_cam=None,
                bev_mask=None,
                level_start_index=None,
                flag='encoder',
                **kwargs):
        """Forward Function of Detr3DCrossAtten.

        Args:
            query (Tensor): Query of Transformer with shape
                (num_query, bs, embed_dims).
            key (Tensor): The key tensor with shape
                `(num_key, bs, embed_dims)`.
            value (Tensor): The value tensor with shape
                `(num_key, bs, embed_dims)`. (B, N, C, H, W)
            residual (Tensor): The tensor used for addition, with the
                same shape as `x`. Default None. If None, `x` will be used.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`. Default:
                None.
            reference_points (Tensor): The normalized reference points
                with shape (bs, num_query, 4); all elements are in the
                range [0, 1], top-left (0, 0), bottom-right (1, 1),
                including the padding area. Or (N, Length_{query},
                num_levels, 4), where the additional two dimensions
                (w, h) form reference boxes.
            key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_key].
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2),
                last dimension represents (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape (num_levels) and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
            Tensor: forwarded results with shape [num_query, bs, embed_dims].
        """

        if key is None:
            key = query
        if value is None:
            value = key

        if residual is None:
            inp_residual = query
            slots = torch.zeros_like(query)
        if query_pos is not None:
            query = query + query_pos

        bs, num_query, _ = query.size()

        D = reference_points_cam.size(3)
        indexes = []
        for i, mask_per_img in enumerate(bev_mask):
            index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1)
            indexes.append(index_query_per_img)
        max_len = max([len(each) for each in indexes])

        # each camera only interacts with its corresponding BEV queries.
        # This step can greatly save GPU memory.
        queries_rebatch = query.new_zeros(
            [bs, self.num_cams, max_len, self.embed_dims])
        reference_points_rebatch = reference_points_cam.new_zeros(
            [bs, self.num_cams, max_len, D, 2])

        for j in range(bs):
            for i, reference_points_per_img in enumerate(reference_points_cam):
                index_query_per_img = indexes[i]
                queries_rebatch[j, i, :len(index_query_per_img)] = \
                    query[j, index_query_per_img]
                reference_points_rebatch[j, i, :len(index_query_per_img)] = \
                    reference_points_per_img[j, index_query_per_img]

        num_cams, l, bs, embed_dims = key.shape

        key = key.permute(2, 0, 1, 3).reshape(
            bs * self.num_cams, l, self.embed_dims)
        value = value.permute(2, 0, 1, 3).reshape(
            bs * self.num_cams, l, self.embed_dims)

        queries = self.deformable_attention(
            query=queries_rebatch.view(bs * self.num_cams, max_len,
                                       self.embed_dims),
            key=key,
            value=value,
            reference_points=reference_points_rebatch.view(
                bs * self.num_cams, max_len, D, 2),
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index).view(
                bs, self.num_cams, max_len, self.embed_dims)

        for j in range(bs):
            for i, index_query_per_img in enumerate(indexes):
                slots[j, index_query_per_img] += \
                    queries[j, i, :len(index_query_per_img)]

        count = bev_mask.sum(-1) > 0
        count = count.permute(1, 2, 0).sum(-1)
        count = torch.clamp(count, min=1.0)
        slots = slots / count[..., None]
        slots = self.output_proj(slots)

        return self.dropout(slots) + inp_residual


@ATTENTION.register_module()
class MSDeformableAttention3D(BaseModule):
    """An attention module used in BEVFormer based on Deformable-Detr.

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
    <https://arxiv.org/pdf/2010.04159.pdf>`_.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
        num_levels (int): The number of feature maps used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 8.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): A Dropout layer on `inp_identity`.
            Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to True.
        norm_cfg (dict): Config dict for normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=8,
                 im2col_step=64,
                 dropout=0.1,
                 batch_first=True,
                 norm_cfg=None,
                 init_cfg=None):
        super().__init__(init_cfg)
        if embed_dims % num_heads != 0:
            raise ValueError(f'embed_dims must be divisible by num_heads, '
                             f'but got {embed_dims} and {num_heads}')
        dim_per_head = embed_dims // num_heads
        self.norm_cfg = norm_cfg
        self.batch_first = batch_first
        self.output_proj = None
        self.fp16_enabled = False

        # you'd better set dim_per_head to a power of 2
        # which is more efficient in the CUDA implementation
        def _is_power_of_2(n):
            if (not isinstance(n, int)) or (n < 0):
                raise ValueError(
                    'invalid input for _is_power_of_2: {} (type: {})'.format(
                        n, type(n)))
            return (n & (n - 1) == 0) and n != 0

        if not _is_power_of_2(dim_per_head):
            warnings.warn(
                "You'd better set embed_dims in "
                'MultiScaleDeformAttention to make '
                'the dimension of each attention head a power of 2 '
                'which is more efficient in our CUDA implementation.')

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points
        self.sampling_offsets = nn.Linear(
            embed_dims, num_heads * num_levels * num_points * 2)
        self.attention_weights = nn.Linear(
            embed_dims, num_heads * num_levels * num_points)
        self.value_proj = nn.Linear(embed_dims, embed_dims)

        self.init_weights()

    def init_weights(self):
        """Default initialization for Parameters of Module."""
        constant_init(self.sampling_offsets, 0.)
        thetas = torch.arange(
            self.num_heads,
            dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init /
                     grid_init.abs().max(-1, keepdim=True)[0]).view(
                         self.num_heads, 1, 1,
                         2).repeat(1, self.num_levels, self.num_points, 1)
        for i in range(self.num_points):
            grid_init[:, :, i, :] *= i + 1

        self.sampling_offsets.bias.data = grid_init.view(-1)
        constant_init(self.attention_weights, val=0., bias=0.)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True

    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                level_start_index=None,
                **kwargs):
        """Forward Function of MultiScaleDeformAttention.

        Args:
            query (Tensor): Query of Transformer with shape
                (bs, num_query, embed_dims).
            key (Tensor): The key tensor with shape
                `(bs, num_key, embed_dims)`.
            value (Tensor): The value tensor with shape
                `(bs, num_key, embed_dims)`.
            identity (Tensor): The tensor used for addition, with the
                same shape as `query`. Default None. If None,
                `query` will be used.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`. Default:
                None.
            reference_points (Tensor): The normalized reference points
                with shape (bs, num_query, num_levels, 2); all elements
                are in the range [0, 1], top-left (0, 0), bottom-right
                (1, 1), including the padding area. Or (N, Length_{query},
                num_levels, 4), where the additional two dimensions
                (w, h) form reference boxes.
            key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_key].
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2),
                last dimension represents (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape ``(num_levels, )`` and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
            Tensor: forwarded results with shape [num_query, bs, embed_dims].
        """

        if value is None:
            value = query
        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos

        if not self.batch_first:
            # change to (bs, num_query, embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        value = self.value_proj(value)
        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)
        value = value.view(bs, num_value, self.num_heads, -1)
        sampling_offsets = self.sampling_offsets(query).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_levels * self.num_points)

        attention_weights = attention_weights.softmax(-1)

        attention_weights = attention_weights.view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points)

        if reference_points.shape[-1] == 2:
            """
            For each BEV query, it owns `num_Z_anchors` in 3D space that have
            different heights. After projecting, each BEV query has
            `num_Z_anchors` reference points in each 2D image.
            For each reference point, we sample `num_points` sampling points.
            For `num_Z_anchors` reference points, it has overall
            `num_points * num_Z_anchors` sampling points.
            """
            offset_normalizer = torch.stack(
                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)

            bs, num_query, num_Z_anchors, xy = reference_points.shape
            reference_points = reference_points[:, :, None, None, None, :, :]
            sampling_offsets = sampling_offsets / \
                offset_normalizer[None, None, None, :, None, :]
            bs, num_query, num_heads, num_levels, num_all_points, xy = \
                sampling_offsets.shape
            sampling_offsets = sampling_offsets.view(
                bs, num_query, num_heads, num_levels,
                num_all_points // num_Z_anchors, num_Z_anchors, xy)
            sampling_locations = reference_points + sampling_offsets
            bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = \
                sampling_locations.shape
            assert num_all_points == num_points * num_Z_anchors

            sampling_locations = sampling_locations.view(
                bs, num_query, num_heads, num_levels, num_all_points, xy)

        elif reference_points.shape[-1] == 4:
            assert False
        else:
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')

        #  sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2
        #  attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points

        if torch.cuda.is_available() and value.is_cuda:
            # the fp32 Function is selected in both branches: the fp16 kernel
            # is numerically unstable for this op
            if value.dtype == torch.float16:
                MultiScaleDeformableAttnFunction = \
                    MultiScaleDeformableAttnFunction_fp32
            else:
                MultiScaleDeformableAttnFunction = \
                    MultiScaleDeformableAttnFunction_fp32
            output = MultiScaleDeformableAttnFunction.apply(
                value, spatial_shapes, level_start_index, sampling_locations,
                attention_weights, self.im2col_step)
        else:
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights)
        if not self.batch_first:
            output = output.permute(1, 0, 2)

        return output
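The memory-saving step in SpatialCrossAttention.forward is the per-camera rebatching: each camera only attends over the BEV queries its `bev_mask` hits, gathered into a padded (bs, num_cams, max_len, C) batch and scattered back with per-query averaging. The toy sketch below isolates that gather/scatter with made-up shapes and a two-camera mask; the per-camera deformable attention itself is elided.

# Isolated sketch of the rebatch/scatter trick; all tensors are toy data.
import torch

bs, num_query, C, num_cams = 1, 6, 4, 2
query = torch.arange(bs * num_query * C, dtype=torch.float32).view(bs, num_query, C)
# bev_mask: (num_cams, bs, num_query, D); a single height anchor (D=1) here
bev_mask = torch.zeros(num_cams, bs, num_query, 1, dtype=torch.bool)
bev_mask[0, :, :3] = True   # camera 0 sees queries 0..2
bev_mask[1, :, 2:5] = True  # camera 1 sees queries 2..4

indexes = [m[0].sum(-1).nonzero().squeeze(-1) for m in bev_mask]
max_len = max(len(idx) for idx in indexes)

queries_rebatch = query.new_zeros(bs, num_cams, max_len, C)
for j in range(bs):
    for i, idx in enumerate(indexes):
        queries_rebatch[j, i, :len(idx)] = query[j, idx]

# ... per-camera deformable attention would run on queries_rebatch here ...

# scatter back, averaging by how many cameras saw each query
slots = torch.zeros_like(query)
for j in range(bs):
    for i, idx in enumerate(indexes):
        slots[j, idx] += queries_rebatch[j, i, :len(idx)]
count = (bev_mask.sum(-1) > 0).permute(1, 2, 0).sum(-1).clamp(min=1)
print(slots / count[..., None])  # query 2 is averaged over both cameras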
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/temporal_self_attention.py  (new file, mode 100644)

# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32
from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch
import warnings
import torch
import torch.nn as nn
from mmcv.cnn import xavier_init, constant_init
from mmcv.cnn.bricks.registry import ATTENTION
import math
from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning,
                        to_2tuple)
from mmcv.utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])


@ATTENTION.register_module()
class TemporalSelfAttention(BaseModule):
    """An attention module used in BEVFormer based on Deformable-Detr.

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
    <https://arxiv.org/pdf/2010.04159.pdf>`_.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
        num_levels (int): The number of feature maps used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 4.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): A Dropout layer on `inp_identity`.
            Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to True.
        norm_cfg (dict): Config dict for normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        num_bev_queue (int): In this version, we only use one history BEV
            and one current BEV, so the length of the BEV queue is 2.
    """

    def __init__(self,
                 embed_dims=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=4,
                 num_bev_queue=2,
                 im2col_step=64,
                 dropout=0.1,
                 batch_first=True,
                 norm_cfg=None,
                 init_cfg=None):
        super().__init__(init_cfg)
        if embed_dims % num_heads != 0:
            raise ValueError(f'embed_dims must be divisible by num_heads, '
                             f'but got {embed_dims} and {num_heads}')
        dim_per_head = embed_dims // num_heads
        self.norm_cfg = norm_cfg
        self.dropout = nn.Dropout(dropout)
        self.batch_first = batch_first
        self.fp16_enabled = False

        # you'd better set dim_per_head to a power of 2
        # which is more efficient in the CUDA implementation
        def _is_power_of_2(n):
            if (not isinstance(n, int)) or (n < 0):
                raise ValueError(
                    'invalid input for _is_power_of_2: {} (type: {})'.format(
                        n, type(n)))
            return (n & (n - 1) == 0) and n != 0

        if not _is_power_of_2(dim_per_head):
            warnings.warn(
                "You'd better set embed_dims in "
                'MultiScaleDeformAttention to make '
                'the dimension of each attention head a power of 2 '
                'which is more efficient in our CUDA implementation.')

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points
        self.num_bev_queue = num_bev_queue
        self.sampling_offsets = nn.Linear(
            embed_dims * self.num_bev_queue,
            num_bev_queue * num_heads * num_levels * num_points * 2)
        self.attention_weights = nn.Linear(
            embed_dims * self.num_bev_queue,
            num_bev_queue * num_heads * num_levels * num_points)
        self.value_proj = nn.Linear(embed_dims, embed_dims)
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.init_weights()

    def init_weights(self):
        """Default initialization for Parameters of Module."""
        constant_init(self.sampling_offsets, 0.)
        thetas = torch.arange(
            self.num_heads,
            dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init /
                     grid_init.abs().max(-1, keepdim=True)[0]).view(
                         self.num_heads, 1, 1,
                         2).repeat(1, self.num_levels * self.num_bev_queue,
                                   self.num_points, 1)

        for i in range(self.num_points):
            grid_init[:, :, i, :] *= i + 1

        self.sampling_offsets.bias.data = grid_init.view(-1)
        constant_init(self.attention_weights, val=0., bias=0.)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True

    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                level_start_index=None,
                flag='decoder',
                **kwargs):
        """Forward Function of MultiScaleDeformAttention.

        Args:
            query (Tensor): Query of Transformer with shape
                (num_query, bs, embed_dims).
            key (Tensor): The key tensor with shape
                `(num_key, bs, embed_dims)`.
            value (Tensor): The value tensor with shape
                `(num_key, bs, embed_dims)`.
            identity (Tensor): The tensor used for addition, with the
                same shape as `query`. Default None. If None,
                `query` will be used.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`. Default:
                None.
            reference_points (Tensor): The normalized reference points
                with shape (bs, num_query, num_levels, 2); all elements
                are in the range [0, 1], top-left (0, 0), bottom-right
                (1, 1), including the padding area. Or (N, Length_{query},
                num_levels, 4), where the additional two dimensions
                (w, h) form reference boxes.
            key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_key].
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2),
                last dimension represents (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape ``(num_levels, )`` and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
            Tensor: forwarded results with shape [num_query, bs, embed_dims].
        """

        if value is None:
            assert self.batch_first
            bs, len_bev, c = query.shape
            value = torch.stack([query, query], 1).reshape(bs * 2, len_bev, c)

            # value = torch.cat([query, query], 0)

        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos
        if not self.batch_first:
            # change to (bs, num_query, embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)
        bs, num_query, embed_dims = query.shape
        _, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
        assert self.num_bev_queue == 2

        query = torch.cat([value[::2], query], -1)
        value_ = value.clone()
        value_[:bs] = value[::2]
        value_[bs:] = value[1::2]

        value = self.value_proj(value)

        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)

        value = value.reshape(bs * self.num_bev_queue,
                              num_value, self.num_heads, -1)

        sampling_offsets = self.sampling_offsets(query)
        sampling_offsets = sampling_offsets.view(
            bs, num_query, self.num_heads, self.num_bev_queue,
            self.num_levels, self.num_points, 2)
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_bev_queue,
            self.num_levels * self.num_points)
        attention_weights = attention_weights.softmax(-1)

        attention_weights = attention_weights.view(
            bs, num_query, self.num_heads, self.num_bev_queue,
            self.num_levels, self.num_points)

        attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5) \
            .reshape(bs * self.num_bev_queue, num_query, self.num_heads,
                     self.num_levels, self.num_points).contiguous()
        sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6) \
            .reshape(bs * self.num_bev_queue, num_query, self.num_heads,
                     self.num_levels, self.num_points, 2)

        if reference_points.shape[-1] == 2:
            offset_normalizer = torch.stack(
                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                + sampling_offsets \
                / offset_normalizer[None, None, None, :, None, :]

        elif reference_points.shape[-1] == 4:
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                + sampling_offsets / self.num_points \
                * reference_points[:, :, None, :, None, 2:] \
                * 0.5
        else:
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
        if torch.cuda.is_available() and value.is_cuda:
            # using fp16 deformable attention is unstable because it performs
            # many sum operations
            if value.dtype == torch.float16:
                MultiScaleDeformableAttnFunction = \
                    MultiScaleDeformableAttnFunction_fp32
            else:
                MultiScaleDeformableAttnFunction = \
                    MultiScaleDeformableAttnFunction_fp32
            output = MultiScaleDeformableAttnFunction.apply(
                value, spatial_shapes, level_start_index, sampling_locations,
                attention_weights, self.im2col_step)
        else:
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights)

        # output shape (bs*num_bev_queue, num_query, embed_dims)
        # (bs*num_bev_queue, num_query, embed_dims) -> (num_query, embed_dims, bs*num_bev_queue)
        output = output.permute(1, 2, 0)

        # fuse history value and current value
        # (num_query, embed_dims, bs*num_bev_queue) -> (num_query, embed_dims, bs, num_bev_queue)
        output = output.view(num_query, embed_dims, bs, self.num_bev_queue)
        output = output.mean(-1)

        # (num_query, embed_dims, bs) -> (bs, num_query, embed_dims)
        output = output.permute(2, 0, 1)

        output = self.output_proj(output)

        if not self.batch_first:
            output = output.permute(1, 0, 2)

        return self.dropout(output) + identity
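The temporal fusion at the end of that forward pass is just a reshape and a mean over the (previous, current) BEV pair. A minimal sketch, with illustrative sizes and a random tensor standing in for the deformable-attention output:

# Toy sketch of the BEV-queue fusion in TemporalSelfAttention.forward.
import torch

bs, num_query, embed_dims, num_bev_queue = 2, 5, 8, 2
# pretend this came out of deformable attention: (bs*num_bev_queue, num_query, C)
output = torch.rand(bs * num_bev_queue, num_query, embed_dims)

output = output.permute(1, 2, 0)                   # (num_query, C, bs*queue)
output = output.view(num_query, embed_dims, bs, num_bev_queue)
output = output.mean(-1)                           # fuse history and current BEV
output = output.permute(2, 0, 1)                   # (bs, num_query, C)
print(output.shape)  # torch.Size([2, 5, 8])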
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/transformer.py  (new file, mode 100644)

# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import numpy as np
import torch
import torch.nn as nn
from mmcv.cnn import xavier_init
from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence
from mmcv.runner.base_module import BaseModule
from mmdet.models.utils.builder import TRANSFORMER
from torch.nn.init import normal_
from torchvision.transforms.functional import rotate
from .temporal_self_attention import TemporalSelfAttention
from .spatial_cross_attention import MSDeformableAttention3D
from .decoder import CustomMSDeformableAttention
from mmcv.runner import force_fp32, auto_fp16
import pdb


@TRANSFORMER.register_module()
class PerceptionTransformer(BaseModule):
    """Implements the Detr3D transformer.

    Args:
        as_two_stage (bool): Generate query from encoder features.
            Default: False.
        num_feature_levels (int): Number of feature maps from FPN.
            Default: 4.
        two_stage_num_proposals (int): Number of proposals when set
            `as_two_stage` as True. Default: 300.
    """

    def __init__(self,
                 decoder=None,
                 embed_dims=256,
                 **kwargs):
        super(PerceptionTransformer, self).__init__(**kwargs)
        self.decoder = build_transformer_layer_sequence(decoder)
        self.embed_dims = embed_dims
        self.fp16_enabled = False
        self.init_layers()

    def init_layers(self):
        """Initialize layers of the Detr3DTransformer."""
        self.reference_points = nn.Linear(self.embed_dims, 3)

    def init_weights(self):
        """Initialize the transformer weights."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        for m in self.modules():
            if isinstance(m, MSDeformableAttention3D) \
                    or isinstance(m, TemporalSelfAttention) \
                    or isinstance(m, CustomMSDeformableAttention):
                try:
                    m.init_weight()
                except AttributeError:
                    m.init_weights()
        xavier_init(self.reference_points, distribution='uniform', bias=0.)

    @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed',
                         'prev_bev', 'bev_pos'))
    def forward(self,
                mlvl_feats,
                bev_embed,
                object_query_embed,
                bev_h,
                bev_w,
                reg_branches=None,
                cls_branches=None,
                **kwargs):
        """Forward function for `Detr3DTransformer`.

        Args:
            mlvl_feats (list(Tensor)): Input queries from
                different levels. Each element has shape
                [bs, num_cams, embed_dims, h, w].
            bev_queries (Tensor): (bev_h*bev_w, c)
            bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w)
            object_query_embed (Tensor): The query embedding for decoder,
                with shape [num_query, c].
            reg_branches (obj:`nn.ModuleList`): Regression heads for
                feature maps from each decoder layer. Only would
                be passed when `with_box_refine` is True. Default to None.

        Returns:
            tuple[Tensor]: results of decoder containing the following tensors.

                - bev_embed: BEV features.
                - inter_states: Outputs from decoder. If
                  return_intermediate_dec is True, output has shape
                  (num_dec_layers, bs, num_query, embed_dims), else has
                  shape (1, bs, num_query, embed_dims).
                - init_reference_out: The initial value of reference
                  points, has shape (bs, num_queries, 4).
                - inter_references_out: The internal value of reference
                  points in decoder, has shape
                  (num_dec_layers, bs, num_query, embed_dims).
                - enc_outputs_class: The classification score of proposals
                  generated from the encoder's feature maps, has shape
                  (batch, h*w, num_classes). Only would be returned when
                  `as_two_stage` is True, otherwise None.
                - enc_outputs_coord_unact: The regression results generated
                  from the encoder's feature maps, has shape
                  (batch, h*w, 4). Only would be returned when
                  `as_two_stage` is True, otherwise None.
        """
        bs = mlvl_feats[0].size(0)
        query_pos, query = torch.split(
            object_query_embed, self.embed_dims, dim=1)
        query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1)
        query = query.unsqueeze(0).expand(bs, -1, -1)
        reference_points = self.reference_points(query_pos)
        reference_points = reference_points.sigmoid()
        init_reference_out = reference_points

        query = query.permute(1, 0, 2)
        query_pos = query_pos.permute(1, 0, 2)
        bev_embed = bev_embed.permute(1, 0, 2)

        inter_states, inter_references = self.decoder(
            query=query,
            key=None,
            value=bev_embed,
            query_pos=query_pos,
            reference_points=reference_points,
            reg_branches=reg_branches,
            cls_branches=cls_branches,
            spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device),
            level_start_index=torch.tensor([0], device=query.device),
            **kwargs)

        inter_references_out = inter_references

        return inter_states, init_reference_out, inter_references_out
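The decoder seeding above splits each object query embedding into a positional half and a content half, then maps the positional half through a learned Linear plus sigmoid to get normalized 3-D reference points. A minimal standalone sketch of that step (query counts and dims are illustrative):

# Sketch of PerceptionTransformer's reference-point initialization.
import torch
import torch.nn as nn

embed_dims, num_query, bs = 256, 50, 2
object_query_embed = nn.Embedding(num_query, embed_dims * 2).weight
reference_points_branch = nn.Linear(embed_dims, 3)

query_pos, query = torch.split(object_query_embed, embed_dims, dim=1)
query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1)
reference_points = reference_points_branch(query_pos).sigmoid()
print(reference_points.shape, bool(reference_points.min() >= 0))  # (2, 50, 3) True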
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/necks/__init__.py  (new file, mode 100644)

from .custom_fpn import *
from .custom_ipm_view_transformer import *
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/necks/custom_fpn.py  (new file, mode 100644)

# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# custom_fpn.py    The OpenLane-V2 Dataset Authors    Apache License, Version 2.0
#
# Contact wanghuijie@pjlab.org.cn if you have any issue.
#
# Copyright (c) 2023 The OpenLane-v2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmcv.runner import BaseModule
from mmdet3d.models import NECKS


@NECKS.register_module()
class CustomFPN(BaseModule):
    r"""
    Notes
    -----
    Adapted from https://github.com/HuangJunJie2017/BEVDet/blob/dev2.0/mmdet3d/models/necks/fpn.py#L11.

    Feature Pyramid Network.

    This is an implementation of paper `Feature Pyramid Networks for Object
    Detection <https://arxiv.org/abs/1612.03144>`_.

    Args:
        in_channels (List[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale).
        num_outs (int): Number of output scales.
        start_level (int): Index of the start input backbone level used to
            build the feature pyramid. Default: 0.
        end_level (int): Index of the end input backbone level (exclusive) to
            build the feature pyramid. Default: -1, which means the last level.
        add_extra_convs (bool | str): If bool, it decides whether to add conv
            layers on top of the original feature maps. Default to False.
            If True, it is equivalent to `add_extra_convs='on_input'`.
            If str, it specifies the source feature map of the extra convs.
            Only the following options are allowed

            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
            - 'on_lateral': Last feature map after lateral convs.
            - 'on_output': The last output feature map after fpn convs.
        relu_before_extra_convs (bool): Whether to apply relu before the extra
            conv. Default: False.
        no_norm_on_lateral (bool): Whether to apply norm on lateral.
            Default: False.
        conv_cfg (dict): Config dict for convolution layer. Default: None.
        norm_cfg (dict): Config dict for normalization layer. Default: None.
        act_cfg (str): Config dict for activation layer in ConvModule.
            Default: None.
        upsample_cfg (dict): Config dict for interpolate layer.
            Default: `dict(mode='nearest')`
        init_cfg (dict or list[dict], optional): Initialization config dict.

    Example:
        >>> import torch
        >>> in_channels = [2, 3, 5, 7]
        >>> scales = [340, 170, 84, 43]
        >>> inputs = [torch.rand(1, c, s, s)
        ...           for c, s in zip(in_channels, scales)]
        >>> self = FPN(in_channels, 11, len(in_channels)).eval()
        >>> outputs = self.forward(inputs)
        >>> for i in range(len(outputs)):
        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
        outputs[0].shape = torch.Size([1, 11, 340, 340])
        outputs[1].shape = torch.Size([1, 11, 170, 170])
        outputs[2].shape = torch.Size([1, 11, 84, 84])
        outputs[3].shape = torch.Size([1, 11, 43, 43])
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_outs,
                 start_level=0,
                 end_level=-1,
                 out_ids=[],
                 add_extra_convs=False,
                 relu_before_extra_convs=False,
                 no_norm_on_lateral=False,
                 conv_cfg=None,
                 norm_cfg=None,
                 act_cfg=None,
                 upsample_cfg=dict(mode='nearest'),
                 init_cfg=dict(
                     type='Xavier', layer='Conv2d', distribution='uniform')):
        super(CustomFPN, self).__init__(init_cfg)
        assert isinstance(in_channels, list)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_ins = len(in_channels)
        self.num_outs = num_outs
        self.relu_before_extra_convs = relu_before_extra_convs
        self.no_norm_on_lateral = no_norm_on_lateral
        self.fp16_enabled = False
        self.upsample_cfg = upsample_cfg.copy()
        self.out_ids = out_ids
        if end_level == -1:
            self.backbone_end_level = self.num_ins
            # assert num_outs >= self.num_ins - start_level
        else:
            # if end_level < inputs, no extra level is allowed
            self.backbone_end_level = end_level
            assert end_level <= len(in_channels)
            assert num_outs == end_level - start_level
        self.start_level = start_level
        self.end_level = end_level
        self.add_extra_convs = add_extra_convs
        assert isinstance(add_extra_convs, (str, bool))
        if isinstance(add_extra_convs, str):
            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
        elif add_extra_convs:  # True
            self.add_extra_convs = 'on_input'

        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()

        for i in range(self.start_level, self.backbone_end_level):
            l_conv = ConvModule(
                in_channels[i],
                out_channels,
                1,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
                act_cfg=act_cfg,
                inplace=False)
            self.lateral_convs.append(l_conv)
            if i in self.out_ids:
                fpn_conv = ConvModule(
                    out_channels,
                    out_channels,
                    3,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg,
                    inplace=False)
                self.fpn_convs.append(fpn_conv)

        # add extra conv layers (e.g., RetinaNet)
        extra_levels = num_outs - self.backbone_end_level + self.start_level
        if self.add_extra_convs and extra_levels >= 1:
            for i in range(extra_levels):
                if i == 0 and self.add_extra_convs == 'on_input':
                    in_channels = self.in_channels[self.backbone_end_level - 1]
                else:
                    in_channels = out_channels
                extra_fpn_conv = ConvModule(
                    in_channels,
                    out_channels,
                    3,
                    stride=2,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg,
                    inplace=False)
                self.fpn_convs.append(extra_fpn_conv)

    def forward(self, inputs):
        """Forward function."""
        assert len(inputs) == len(self.in_channels)

        # build laterals
        laterals = [
            lateral_conv(inputs[i + self.start_level])
            for i, lateral_conv in enumerate(self.lateral_convs)
        ]

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
            # it cannot co-exist with `size` in `F.interpolate`.
            if 'scale_factor' in self.upsample_cfg:
                laterals[i - 1] += F.interpolate(laterals[i],
                                                 **self.upsample_cfg)
            else:
                prev_shape = laterals[i - 1].shape[2:]
                laterals[i - 1] += F.interpolate(
                    laterals[i], size=prev_shape, **self.upsample_cfg)

        # build outputs
        # part 1: from original levels
        outs = [self.fpn_convs[i](laterals[i]) for i in self.out_ids]
        # part 2: add extra levels
        if self.num_outs > len(outs):
            # use max pool to get more levels on top of outputs
            # (e.g., Faster R-CNN, Mask R-CNN)
            if not self.add_extra_convs:
                for i in range(self.num_outs - used_backbone_levels):
                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
            # add conv layers on top of original feature maps (RetinaNet)
            else:
                if self.add_extra_convs == 'on_input':
                    extra_source = inputs[self.backbone_end_level - 1]
                elif self.add_extra_convs == 'on_lateral':
                    extra_source = laterals[-1]
                elif self.add_extra_convs == 'on_output':
                    extra_source = outs[-1]
                else:
                    raise NotImplementedError
                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
                for i in range(used_backbone_levels + 1, self.num_outs):
                    if self.relu_before_extra_convs:
                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
                    else:
                        outs.append(self.fpn_convs[i](outs[-1]))
        return outs[0]
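The part of this neck that matters for the baseline config is the top-down pathway plus the `out_ids` selection: only the requested pyramid levels get a 3x3 output conv, and `forward` returns a single map. Below is a standalone sketch of that pathway with plain, randomly initialized Conv2d layers standing in for mmcv's ConvModule; channels and sizes are made up.

# Top-down FPN pathway in miniature (shape demo only, untrained weights).
import torch
import torch.nn as nn
import torch.nn.functional as F

in_channels, out_channels = [64, 128], 32
feats = [torch.rand(1, 64, 32, 32), torch.rand(1, 128, 16, 16)]
laterals = [nn.Conv2d(c, out_channels, 1)(x) for c, x in zip(in_channels, feats)]

for i in range(len(laterals) - 1, 0, -1):
    prev_shape = laterals[i - 1].shape[2:]
    laterals[i - 1] = laterals[i - 1] + F.interpolate(
        laterals[i], size=prev_shape, mode='nearest')

# out_ids=[0] in this commit's baseline config: only the finest level is kept
out = nn.Conv2d(out_channels, out_channels, 3, padding=1)(laterals[0])
print(out.shape)  # torch.Size([1, 32, 32, 32])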
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/necks/custom_ipm_view_transformer.py  (new file, mode 100644)

# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# custom_ipm_view_transformer.py    The OpenLane-V2 Dataset Authors    Apache License, Version 2.0
#
# Contact wanghuijie@pjlab.org.cn if you have any issue.
#
# Copyright (c) 2023 The OpenLane-v2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import copy
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.runner import BaseModule
from mmdet3d.models import NECKS


def get_campos(reference_points, ego2cam, img_shape):
    '''
    Find each reference point's corresponding pixel in each camera.

    Args:
        reference_points: [B, num_query, 3]
        ego2cam: (B, num_cam, 4, 4)
    Outs:
        reference_points_cam: (B*num_cam, num_query, 2)
        mask: (B, num_cam, num_query)
        num_query == W*H
    '''
    ego2cam = reference_points.new_tensor(ego2cam)  # (B, N, 4, 4)
    reference_points = reference_points.clone()

    B, num_query = reference_points.shape[:2]
    num_cam = ego2cam.shape[1]

    # reference_points (B, num_queries, 4)
    reference_points = torch.cat(
        (reference_points, torch.ones_like(reference_points[..., :1])), -1)

    reference_points = reference_points.view(
        B, 1, num_query, 4).repeat(1, num_cam, 1, 1).unsqueeze(-1)
    ego2cam = ego2cam.view(
        B, num_cam, 1, 4, 4).repeat(1, 1, num_query, 1, 1)

    # reference_points_cam (B, num_cam, num_queries, 4)
    reference_points_cam = (ego2cam @ reference_points).squeeze(-1)
    eps = 1e-9
    mask = (reference_points_cam[..., 2:3] > eps)

    reference_points_cam = \
        reference_points_cam[..., 0:2] / \
        reference_points_cam[..., 2:3] + eps
    reference_points_cam[..., 0] /= img_shape[1]
    reference_points_cam[..., 1] /= img_shape[0]
    # from 0~1 to -1~1
    reference_points_cam = (reference_points_cam - 0.5) * 2

    mask = (mask & (reference_points_cam[..., 0:1] > -1.0)
            & (reference_points_cam[..., 0:1] < 1.0)
            & (reference_points_cam[..., 1:2] > -1.0)
            & (reference_points_cam[..., 1:2] < 1.0))

    # (B, num_cam, num_query)
    mask = mask.view(B, num_cam, num_query)
    reference_points_cam = reference_points_cam.view(B * num_cam, num_query, 2)

    return reference_points_cam, mask


def construct_plane_grid(xbound, ybound, height: float, dtype=torch.float32):
    '''
    Returns:
        plane: H, W, 3
    '''
    xmin, xmax = xbound[0], xbound[1]
    num_x = int((xbound[1] - xbound[0]) / xbound[2])
    ymin, ymax = ybound[0], ybound[1]
    num_y = int((ybound[1] - ybound[0]) / ybound[2])

    x = torch.linspace(xmin, xmax, num_x, dtype=dtype)
    y = torch.linspace(ymin, ymax, num_y, dtype=dtype)
    # [num_y, num_x]
    y, x = torch.meshgrid(y, x)
    z = torch.ones_like(x) * height

    # [num_y, num_x, 3]
    plane = torch.stack([x, y, z], dim=-1)
    return plane


@NECKS.register_module()
class CustomIPMViewTransformer(BaseModule):
    r"""
    Notes
    -----
    Adapted from https://github.com/Mrmoore98/VectorMapNet_code/blob/mian/plugin/models/backbones/ipm_backbone.py#L238.
    """

    def __init__(self, num_cam, xbound, ybound, zbound, out_channels):
        super().__init__()
        self.x_bound = xbound
        self.y_bound = ybound
        heights = [zbound[0] + i * zbound[2]
                   for i in range(int((zbound[1] - zbound[0]) // zbound[2]) + 1)]
        self.heights = heights
        self.num_cam = num_cam

        self.outconvs = \
            nn.Conv2d((out_channels + 3) * len(heights), out_channels,
                      kernel_size=3, stride=1, padding=1)  # same

        # bev_plane
        bev_planes = [construct_plane_grid(
            xbound, ybound, h) for h in self.heights]
        self.register_buffer('bev_planes', torch.stack(bev_planes))  # nlvl, bH, bW, 3

    def forward(self, cam_feat, ego2cam, img_shape):
        '''
        inverse project

        Args:
            cam_feat: B*ncam, C, cH, cW
            img_shape: tuple(H, W)
        Returns:
            project_feat: B, C, nlvl, bH, bW
            bev_feat_mask: B, 1, nlvl, bH, bW
        '''
        B = ego2cam.shape[0]
        C = cam_feat.shape[1]
        bev_grid = self.bev_planes.unsqueeze(0).repeat(B, 1, 1, 1, 1)
        nlvl, bH, bW = bev_grid.shape[1:4]
        bev_grid = bev_grid.flatten(1, 3)  # B, nlvl*bH*bW, 3

        # Find points in cam coords
        # bev_grid_pos: B*ncam, nlvl*bH*bW, 2
        bev_grid_pos, bev_cam_mask = get_campos(bev_grid, ego2cam, img_shape)

        # B*cam, nlvl*bH, bW, 2
        bev_grid_pos = bev_grid_pos.unflatten(-2, (nlvl * bH, bW))

        # project feat from 2D to bev plane
        projected_feature = F.grid_sample(
            cam_feat, bev_grid_pos).view(B, -1, C, nlvl, bH, bW)  # B, cam, C, nlvl, bH, bW
        # B, cam, nlvl, bH, bW
        bev_feat_mask = bev_cam_mask.unflatten(-1, (nlvl, bH, bW))

        # eliminate the ncam dim
        # The bev feature is the sum over the cameras
        bev_feat_mask = bev_feat_mask.unsqueeze(2)
        projected_feature = (projected_feature * bev_feat_mask).sum(1)
        num_feat = bev_feat_mask.sum(1)
        projected_feature = projected_feature / \
            num_feat.masked_fill(num_feat == 0, 1)

        # concatenate position information
        # projected_feature: B, C+3, nlvl, bH, bW
        bev_grid = bev_grid.view(B, nlvl, bH, bW, 3).permute(0, 4, 1, 2, 3)
        projected_feature = torch.cat((projected_feature, bev_grid), dim=1)

        bev_feat, bev_feat_mask = projected_feature, bev_feat_mask.sum(1) > 0

        # fuse the multiple height levels into a single map
        bev_feat = bev_feat.flatten(1, 2)
        bev_feat = self.outconvs(bev_feat)

        return bev_feat
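The core IPM move here is: build a BEV plane grid at each candidate height, push the homogeneous points through the 4x4 ego-to-camera matrix, and keep only points that land in front of the camera and inside the image. A toy sketch of that projection step follows; the identity matrix standing in for `ego2cam` and the plane height are made up for illustration.

# BEV plane grid + homogeneous projection, in miniature.
import torch

def construct_plane_grid(xbound, ybound, height):
    x = torch.linspace(xbound[0], xbound[1], int((xbound[1] - xbound[0]) / xbound[2]))
    y = torch.linspace(ybound[0], ybound[1], int((ybound[1] - ybound[0]) / ybound[2]))
    y, x = torch.meshgrid(y, x)
    return torch.stack([x, y, torch.full_like(x, height)], dim=-1)  # (H, W, 3)

plane = construct_plane_grid([-50.0, 50.0, 1.0], [-25.0, 25.0, 1.0], height=1.0)
pts = torch.cat([plane.reshape(-1, 3), torch.ones(plane.numel() // 3, 1)], dim=-1)

ego2cam = torch.eye(4)          # stand-in for a real projection matrix
cam_pts = (ego2cam @ pts.T).T   # (num_points, 4) in camera coordinates
in_front = cam_pts[:, 2:3] > 1e-9
print(plane.shape, int(in_front.sum()))  # torch.Size([50, 100, 3]) 5000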
autonomous_driving/openlane-v2/plugin/mmdet3d/configs/baseline.py
0 → 100644
View file @
80e8c1d3
custom_imports
=
dict
(
imports
=
[
'projects.openlanev2.baseline'
])
method_para
=
dict
(
n_control
=
5
)
# #point for each curve
_dim_
=
128
model
=
dict
(
type
=
'Baseline'
,
img_backbone
=
dict
(
type
=
'ResNet'
,
depth
=
18
,
num_stages
=
4
,
out_indices
=
(
2
,
3
),
frozen_stages
=-
1
,
norm_cfg
=
dict
(
type
=
'BN'
,
requires_grad
=
True
),
norm_eval
=
False
,
with_cp
=
True
,
style
=
'pytorch'
,
init_cfg
=
dict
(
type
=
'Pretrained'
,
checkpoint
=
'torchvision://resnet18'
)),
img_neck
=
dict
(
type
=
'CustomFPN'
,
in_channels
=
[
_dim_
*
2
,
_dim_
*
4
],
out_channels
=
_dim_
,
num_outs
=
1
,
start_level
=
0
,
out_ids
=
[
0
]),
img_view_transformer
=
dict
(
type
=
'CustomIPMViewTransformer'
,
num_cam
=
7
,
xbound
=
[
-
50.0
,
50.0
,
1.0
],
ybound
=
[
-
25.0
,
25.0
,
1.0
],
zbound
=
[
-
3.0
,
2.0
,
0.5
],
out_channels
=
_dim_
),
lc_head
=
dict
(
type
=
'CustomDETRHead'
,
num_classes
=
1
,
in_channels
=
_dim_
,
num_query
=
50
,
object_type
=
'lane'
,
num_layers
=
1
,
num_reg_dim
=
method_para
[
'n_control'
]
*
3
,
loss_cls
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
1.0
),
loss_bbox
=
dict
(
type
=
'L1Loss'
,
loss_weight
=
2.5
),
loss_iou
=
dict
(
type
=
'GIoULoss'
,
loss_weight
=
0.0
),
# dummy
train_cfg
=
dict
(
assigner
=
dict
(
type
=
'LaneHungarianAssigner'
,
cls_cost
=
dict
(
type
=
'FocalLossCost'
,
weight
=
1.0
),
reg_cost
=
dict
(
type
=
'LaneL1Cost'
,
weight
=
2.5
),
iou_cost
=
dict
(
type
=
'IoUCost'
,
weight
=
0.0
))),
# dummy
bev_range
=
[
-
50.0
,
-
25.0
,
-
3.0
,
50.0
,
25.0
,
2.0
]),
te_head
=
dict
(
type
=
'CustomDETRHead'
,
num_classes
=
13
,
in_channels
=
_dim_
,
num_query
=
30
,
object_type
=
'bbox'
,
num_layers
=
1
,
loss_cls
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
1.0
),
loss_bbox
=
dict
(
type
=
'L1Loss'
,
loss_weight
=
2.5
),
loss_iou
=
dict
(
type
=
'GIoULoss'
,
loss_weight
=
1.0
),
train_cfg
=
dict
(
assigner
=
dict
(
type
=
'HungarianAssigner'
,
cls_cost
=
dict
(
type
=
'FocalLossCost'
,
weight
=
1.0
),
reg_cost
=
dict
(
type
=
'BBoxL1Cost'
,
weight
=
2.5
,
box_format
=
'xywh'
),
iou_cost
=
dict
(
type
=
'IoUCost'
,
iou_mode
=
'giou'
,
weight
=
1.0
)))),
lclc_head
=
dict
(
type
=
'TopologyHead'
,
in_channels
=
128
,
hidden_channels
=
_dim_
,
out_channels
=
1
,
num_layers
=
3
,
loss_cls
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
2.0
)),
lcte_head
=
dict
(
type
=
'TopologyHead'
,
in_channels
=
128
,
hidden_channels
=
_dim_
,
out_channels
=
1
,
num_layers
=
3
,
loss_cls
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
2.0
)))
img_norm_cfg
=
dict
(
mean
=
[
103.530
,
116.280
,
123.675
],
std
=
[
1.0
,
1.0
,
1.0
],
to_rgb
=
False
)
train_pipeline
=
[
dict
(
type
=
'CustomLoadMultiViewImageFromFiles'
,
to_float32
=
True
),
dict
(
type
=
'NormalizeMultiviewImage'
,
**
img_norm_cfg
),
dict
(
type
=
'PhotoMetricDistortionMultiViewImage'
),
dict
(
type
=
'ResizeFrontView'
),
dict
(
type
=
'CustomPadMultiViewImage'
,
size_divisor
=
32
),
dict
(
type
=
'CustomParameterizeLane'
,
method
=
'bezier_Endpointfixed'
,
method_para
=
method_para
),
dict
(
type
=
'CustomDefaultFormatBundle'
),
dict
(
type
=
'Collect'
,
keys
=
[
'img'
,
'gt_lc'
,
'gt_lc_labels'
,
'gt_te'
,
'gt_te_labels'
,
'gt_topology_lclc'
,
'gt_topology_lcte'
,
],
meta_keys
=
[
'scene_token'
,
'sample_idx'
,
'img_paths'
,
'img_shape'
,
'scale_factor'
,
'pad_shape'
,
'lidar2img'
,
'can_bus'
,
],
)
]
test_pipeline
=
[
dict
(
type
=
'CustomLoadMultiViewImageFromFiles'
,
to_float32
=
True
),
dict
(
type
=
'NormalizeMultiviewImage'
,
**
img_norm_cfg
),
dict
(
type
=
'ResizeFrontView'
),
dict
(
type
=
'CustomPadMultiViewImage'
,
size_divisor
=
32
),
dict
(
type
=
'CustomDefaultFormatBundle'
),
dict
(
type
=
'Collect'
,
keys
=
[
'img'
,
],
meta_keys
=
[
'scene_token'
,
'sample_idx'
,
'img_paths'
,
'img_shape'
,
'scale_factor'
,
'pad_shape'
,
'lidar2img'
,
'can_bus'
,
],
)
]
dataset_type
=
'OpenLaneV2SubsetADataset'
data_root
=
'OpenLane-V2/data/OpenLane-V2'
meta_root
=
'OpenLane-V2/data/OpenLane-V2'
data
=
dict
(
samples_per_gpu
=
2
,
workers_per_gpu
=
4
,
train
=
dict
(
type
=
dataset_type
,
data_root
=
data_root
,
meta_root
=
meta_root
,
collection
=
'data_dict_subset_A_train'
,
pipeline
=
train_pipeline
,
test_mode
=
False
),
val
=
dict
(
type
=
dataset_type
,
data_root
=
data_root
,
meta_root
=
meta_root
,
collection
=
'data_dict_subset_A_val'
,
pipeline
=
test_pipeline
,
test_mode
=
True
),
test
=
dict
(
type
=
dataset_type
,
data_root
=
data_root
,
meta_root
=
meta_root
,
collection
=
'data_dict_subset_A_val'
,
pipeline
=
test_pipeline
,
test_mode
=
True
),
shuffler_sampler
=
dict
(
type
=
'DistributedGroupSampler'
),
nonshuffler_sampler
=
dict
(
type
=
'DistributedSampler'
))
optimizer
=
dict
(
type
=
'AdamW'
,
lr
=
1e-4
,
weight_decay
=
1e-4
)
optimizer_config
=
dict
(
grad_clip
=
dict
(
max_norm
=
35
,
norm_type
=
2
))
lr_config
=
dict
(
policy
=
'CosineAnnealing'
,
warmup
=
'linear'
,
warmup_iters
=
500
,
warmup_ratio
=
1.0
/
3
,
min_lr_ratio
=
1e-3
)
runner
=
dict
(
type
=
'EpochBasedRunner'
,
max_epochs
=
20
)
evaluation
=
dict
(
interval
=
1
,
pipeline
=
test_pipeline
)
checkpoint_config
=
dict
(
interval
=
1
,
max_keep_ckpts
=
1
)
# yapf:disable
log_config = dict(
    interval=10,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
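As a quick sanity check (an editor's sketch, not part of the commit), the finished config can be loaded with mmcv and the fields above inspected:

from mmcv import Config

cfg = Config.fromfile(
    'autonomous_driving/openlane-v2/plugin/mmdet3d/configs/baseline.py')
print(cfg.runner.max_epochs)     # 20
print(cfg.data.samples_per_gpu)  # 2
print(cfg.lr_config.policy)      # 'CosineAnnealing'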
autonomous_driving/openlane-v2/plugin/mmdet3d/configs/baseline_large.py
0 → 100644
custom_imports = dict(imports=['plugin.mmdet3d.baseline'])

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -25.6, -2.3, 51.2, 25.6, 1.7]
voxel_size = [0.2, 0.2, 8]
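# Editor's note: the range/voxel arithmetic can be checked directly:
#     extent = (102.4, 51.2, 4.0) m in (x, y, z)
#     extent / voxel_size = (512, 256, 0.5) voxels
# so changing point_cloud_range without revisiting voxel_size (and grid_size
# in train_cfg below) silently changes the grid the heads are trained on.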
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
class_names = ['centerline']
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
num_cams = 7
Map_size = [(-50, 50), (-25, 25)]
method_para = dict(n_control=5)  # number of control points per curve
code_size = 3 * method_para['n_control']
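# Editor's sketch: with 'bezier_Endpointfixed' (see train_pipeline below) and
# n_control = 5, each lane centerline is regressed as five 3D Bezier control
# points P_0..P_4, i.e. code_size = 3 * 5 = 15 targets per lane. The curve is
# recovered from the control points via the Bernstein basis (degree 4):
#     B(t) = sum_{k=0..4} C(4, k) * t**k * (1 - t)**(4 - k) * P_k,  t in [0, 1]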
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_ffn_cfg_ = dict(
    type='FFN',
    embed_dims=_dim_,
    feedforward_channels=_ffn_dim_,
    num_fcs=2,
    ffn_drop=0.1,
    act_cfg=dict(type='ReLU', inplace=True),
)
_num_levels_ = 4
bev_h_ = 100
bev_w_ = 200
model = dict(
    type='ROAD_BEVFormer',
    video_test_mode=False,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True),
    bev_constructor=dict(
        type='BEVFormerConstructer',
        num_feature_levels=_num_levels_,
        num_cams=num_cams,
        embed_dims=_dim_,
        rotate_prev_bev=True,
        use_shift=True,
        use_can_bus=True,
        pc_range=point_cloud_range,
        bev_h=bev_h_,
        bev_w=bev_w_,
        rotate_center=[bev_h_ // 2, bev_w_ // 2],
        encoder=dict(
            type='BEVFormerEncoder',
            num_layers=3,
            pc_range=point_cloud_range,
            num_points_in_pillar=4,
            return_intermediate=False,
            transformerlayers=dict(
                type='BEVFormerLayer',
                attn_cfgs=[
                    dict(
                        type='TemporalSelfAttention',
                        embed_dims=_dim_,
                        num_levels=1),
                    dict(
                        type='SpatialCrossAttention',
                        embed_dims=_dim_,
                        num_cams=num_cams,
                        pc_range=point_cloud_range,
                        deformable_attention=dict(
                            type='MSDeformableAttention3D',
                            embed_dims=_dim_,
                            num_points=8,
                            num_levels=_num_levels_))
                ],
                ffn_cfgs=_ffn_cfg_,
                operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                 'ffn', 'norm'))),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
    ),
    bbox_head=dict(
        type='TEDeformableDETRHead',
        num_query=100,
        num_classes=13,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='DeformableDetrTransformer',
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=_dim_),
                    ffn_cfgs=_ffn_cfg_,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DeformableDetrTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='CustomDetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=_dim_)
                    ],
                    ffn_cfgs=_ffn_cfg_,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=_pos_dim_,
            normalize=True,
            offset=-0.5),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=2.5),
        loss_iou=dict(type='GIoULoss', loss_weight=1.0),
        test_cfg=dict(max_per_img=50)),
    pts_bbox_head=dict(
        type='LCDeformableDETRHead',
        num_classes=1,
        in_channels=_dim_,
        num_query=100,
        bev_h=bev_h_,
        bev_w=bev_w_,
        sync_cls_avg_factor=False,
        with_box_refine=False,
        with_shared_param=False,
        code_size=code_size,
        code_weights=[1.0 for i in range(code_size)],
        pc_range=point_cloud_range,
        transformer=dict(
            type='PerceptionTransformer',
            embed_dims=_dim_,
            decoder=dict(
                type='LaneDetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='CustomDetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                    ],
                    ffn_cfgs=_ffn_cfg_,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.5),
        loss_bbox=dict(type='L1Loss', loss_weight=0.0075),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    lclc_head=dict(
        type='RelationshipHead',
        in_channels_o1=_dim_,
        in_channels_o2=_dim_,
        shared_param=False,
        loss_rel=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=5)),
    lcte_head=dict(
        type='RelationshipHead',
        in_channels_o1=_dim_,
        in_channels_o2=_dim_,
        shared_param=False,
        loss_rel=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=5)),
    # model training and testing settings
    bbox_train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=1.0),
            reg_cost=dict(type='BBoxL1Cost', weight=2.5, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0))),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='LaneHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=1.5),
                reg_cost=dict(type='LaneL1Cost', weight=0.0075),
                iou_cost=dict(type='IoUCost', weight=0.0),
                # Fake cost. This is just to make it compatible with DETR head.
            ))))
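# Editor's note (assuming the usual BEVFormer convention that BEV rows span
# the lateral y-range and columns the longitudinal x-range): the 100 x 200
# grid over 51.2 m x 102.4 m gives uniform BEV cells of
# 51.2 / 100 = 102.4 / 200 = 0.512 m per side.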
train_pipeline = [
    dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='ResizeFrontView'),
    dict(type='CustomPadMultiViewImage', size_divisor=32),
    dict(
        type='CustomParameterizeLane',
        method='bezier_Endpointfixed',
        method_para=method_para),
    dict(type='CustomDefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img', 'gt_lc', 'gt_lc_labels', 'gt_te', 'gt_te_labels',
            'gt_topology_lclc', 'gt_topology_lcte',
        ],
        meta_keys=[
            'scene_token', 'sample_idx', 'img_paths', 'img_shape',
            'scale_factor', 'pad_shape', 'lidar2img', 'can_bus',
        ]),
]
test_pipeline = [
    dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='ResizeFrontView'),
    dict(type='CustomPadMultiViewImage', size_divisor=32),
    dict(type='CustomDefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'scene_token', 'sample_idx', 'img_paths', 'img_shape',
            'scale_factor', 'pad_shape', 'lidar2img', 'can_bus',
        ]),
]
dataset_type = 'OpenLaneV2SubsetADataset'
data_root = 'data/OpenLane-V2'
meta_root = 'data/OpenLane-V2'
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=8,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        meta_root=meta_root,
        collection='data_dict_subset_A_train',
        pipeline=train_pipeline,
        test_mode=False),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        meta_root=meta_root,
        collection='data_dict_subset_A_val',
        pipeline=test_pipeline,
        test_mode=True),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        meta_root=meta_root,
        collection='data_dict_subset_A_val',
        pipeline=test_pipeline,
        test_mode=True),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
optimizer = dict(
    type='AdamW',
    lr=2e-4,
    paramwise_cfg=dict(custom_keys={
        'img_backbone': dict(lr_mult=0.1),
    }),
    weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
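# Editor's sketch of what paramwise_cfg does here (per mmcv's
# DefaultOptimizerConstructor, as commonly implemented): every parameter
# whose name contains 'img_backbone' trains at lr * lr_mult = 2e-4 * 0.1
# = 2e-5, while all remaining parameters keep the base 2e-4.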
# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)
total_epochs = 24
evaluation = dict(interval=1, pipeline=test_pipeline)
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook')
    ])
checkpoint_config = dict(interval=1, max_keep_ckpts=1)
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
\ No newline at end of file
autonomous_driving/openlane-v2/plugin/mmdet3d/configs/internimage-s.py
0 → 100644
custom_imports = dict(imports=['plugin.mmdet3d.baseline'])

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -25.6, -2.3, 51.2, 25.6, 1.7]
voxel_size = [0.2, 0.2, 8]
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
class_names = ['centerline']
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
num_cams = 7
Map_size = [(-50, 50), (-25, 25)]
method_para = dict(n_control=5)  # number of control points per curve
code_size = 3 * method_para['n_control']
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_ffn_cfg_ = dict(
    type='FFN',
    embed_dims=_dim_,
    feedforward_channels=_ffn_dim_,
    num_fcs=2,
    ffn_drop=0.1,
    act_cfg=dict(type='ReLU', inplace=True),
)
_num_levels_ = 4
bev_h_ = 100
bev_w_ = 200
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_s_1k_224.pth'
model = dict(
    type='ROAD_BEVFormer',
    video_test_mode=False,
    img_backbone=dict(
        type='InternImage',
        core_op='DCNv3',
        channels=80,
        depths=[4, 4, 21, 4],
        groups=[5, 10, 20, 40],
        mlp_ratio=4.,
        drop_path_rate=0.3,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=1.0,
        post_norm=True,
        with_cp=False,
        out_indices=(0, 1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    img_neck=dict(
        type='FPN',
        in_channels=[80, 160, 320, 640],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True),
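    # Editor's note: InternImage-S doubles its channel width at each of its
    # four stages, so the stem width of 80 yields [80 * 2**i for i in range(4)]
    # = [80, 160, 320, 640], matching the FPN in_channels above.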
    bev_constructor=dict(
        type='BEVFormerConstructer',
        num_feature_levels=_num_levels_,
        num_cams=num_cams,
        embed_dims=_dim_,
        rotate_prev_bev=True,
        use_shift=True,
        use_can_bus=True,
        pc_range=point_cloud_range,
        bev_h=bev_h_,
        bev_w=bev_w_,
        rotate_center=[bev_h_ // 2, bev_w_ // 2],
        encoder=dict(
            type='BEVFormerEncoder',
            num_layers=3,
            pc_range=point_cloud_range,
            num_points_in_pillar=4,
            return_intermediate=False,
            transformerlayers=dict(
                type='BEVFormerLayer',
                attn_cfgs=[
                    dict(
                        type='TemporalSelfAttention',
                        embed_dims=_dim_,
                        num_levels=1),
                    dict(
                        type='SpatialCrossAttention',
                        embed_dims=_dim_,
                        num_cams=num_cams,
                        pc_range=point_cloud_range,
                        deformable_attention=dict(
                            type='MSDeformableAttention3D',
                            embed_dims=_dim_,
                            num_points=8,
                            num_levels=_num_levels_))
                ],
                ffn_cfgs=_ffn_cfg_,
                operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                 'ffn', 'norm'))),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
    ),
    bbox_head=dict(
        type='TEDeformableDETRHead',
        num_query=100,
        num_classes=13,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='DeformableDetrTransformer',
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=_dim_),
                    ffn_cfgs=_ffn_cfg_,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DeformableDetrTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='CustomDetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=_dim_)
                    ],
                    ffn_cfgs=_ffn_cfg_,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=_pos_dim_,
            normalize=True,
            offset=-0.5),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=2.5),
        loss_iou=dict(type='GIoULoss', loss_weight=1.0),
        test_cfg=dict(max_per_img=50)),
    pts_bbox_head=dict(
        type='LCDeformableDETRHead',
        num_classes=1,
        in_channels=_dim_,
        num_query=100,
        bev_h=bev_h_,
        bev_w=bev_w_,
        sync_cls_avg_factor=False,
        with_box_refine=False,
        with_shared_param=False,
        code_size=code_size,
        code_weights=[1.0 for i in range(code_size)],
        pc_range=point_cloud_range,
        transformer=dict(
            type='PerceptionTransformer',
            embed_dims=_dim_,
            decoder=dict(
                type='LaneDetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='CustomDetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                    ],
                    ffn_cfgs=_ffn_cfg_,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.5),
        loss_bbox=dict(type='L1Loss', loss_weight=0.0075),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    lclc_head=dict(
        type='RelationshipHead',
        in_channels_o1=_dim_,
        in_channels_o2=_dim_,
        shared_param=False,
        loss_rel=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=5)),
    lcte_head=dict(
        type='RelationshipHead',
        in_channels_o1=_dim_,
        in_channels_o2=_dim_,
        shared_param=False,
        loss_rel=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=5)),
    # model training and testing settings
    bbox_train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=1.0),
            reg_cost=dict(type='BBoxL1Cost', weight=2.5, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0))),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='LaneHungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=1.5),
                reg_cost=dict(type='LaneL1Cost', weight=0.0075),
                iou_cost=dict(type='IoUCost', weight=0.0),
                # Fake cost. This is just to make it compatible with DETR head.
            ))))
train_pipeline = [
    dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='ResizeFrontView'),
    dict(type='CustomPadMultiViewImage', size_divisor=32),
    dict(
        type='CustomParameterizeLane',
        method='bezier_Endpointfixed',
        method_para=method_para),
    dict(type='CustomDefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img', 'gt_lc', 'gt_lc_labels', 'gt_te', 'gt_te_labels',
            'gt_topology_lclc', 'gt_topology_lcte',
        ],
        meta_keys=[
            'scene_token', 'sample_idx', 'img_paths', 'img_shape',
            'scale_factor', 'pad_shape', 'lidar2img', 'can_bus',
        ]),
]
test_pipeline = [
    dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='ResizeFrontView'),
    dict(type='CustomPadMultiViewImage', size_divisor=32),
    dict(type='CustomDefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'scene_token', 'sample_idx', 'img_paths', 'img_shape',
            'scale_factor', 'pad_shape', 'lidar2img', 'can_bus',
        ]),
]
dataset_type = 'OpenLaneV2SubsetADataset'
data_root = 'data/OpenLane-V2'
meta_root = 'data/OpenLane-V2'
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=8,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        meta_root=meta_root,
        collection='data_dict_subset_A_train',
        pipeline=train_pipeline,
        test_mode=False),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        meta_root=meta_root,
        collection='data_dict_subset_A_val',
        pipeline=test_pipeline,
        test_mode=True),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        meta_root=meta_root,
        collection='data_dict_subset_A_val',
        pipeline=test_pipeline,
        test_mode=True),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
optimizer = dict(
    type='AdamW',
    lr=2e-4,
    paramwise_cfg=dict(custom_keys={
        'img_backbone': dict(lr_mult=0.1),
    }),
    weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)
total_epochs = 24
evaluation = dict(interval=1, pipeline=test_pipeline)
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook')
    ])
checkpoint_config = dict(interval=1, max_keep_ckpts=1)
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
\ No newline at end of file
autonomous_driving/openlane-v2/requirements.txt
0 → 100644
tqdm
ninja
jupyter
openmim
matplotlib
numpy >=1.22.0, <1.24.0
scikit-learn
similaritymeasures
opencv-python
scipy ==1.8.0
ortools ==9.2.9972
iso3166
chardet
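A minimal check (an editor's sketch, not part of the commit) that the numpy and scipy pins above are satisfied in the current environment:

import numpy
import scipy

print(numpy.__version__)  # expected >= 1.22.0 and < 1.24.0
print(scipy.__version__)  # expected 1.8.0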
autonomous_driving/openlane-v2/setup.py
0 → 100644
# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# setup.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
# Contact wanghuijie@pjlab.org.cn if you have any issue.
#
# Copyright (c) 2023 The OpenLane-v2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from setuptools import setup, find_packages

setup(
    name='openlanev2',
    version='0.1.0',
    author='The OpenLane-V2 Dataset Authors',
    author_email='wanghuijie@pjlab.org.cn',
    description='The official devkit of the OpenLane-V2 dataset.',
    url='https://github.com/OpenDriveLab/OpenLane-V2',
    packages=find_packages(),
    license='Apache License 2.0',
)
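After an editable install (python -m pip install -e . from this directory), the package can be imported by name; a minimal smoke test (editor's sketch):

import openlanev2  # succeeds once the setup.py above has been installed
print(openlanev2.__name__)  # 'openlanev2'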
autonomous_driving/openlane-v2/tutorial.ipynb
0 → 100644
This source diff could not be displayed because it is too large; view the blob instead.