Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
TS-MODELS-OPT
training
Autonomous-Driving-models
Commits
007f2e68
"platforms/common/src/kernels/customNonbondedGroups.cc" did not exist on "06db7ac4ad1ebf20281314daf3af6e60307a8963"
Commit
007f2e68
authored
Apr 08, 2026
by
雍大凯
Browse files
将子模块转换为普通目录
parent
19472568
Changes
192
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
4531 additions
and
0 deletions
+4531
-0
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t1-24ep.py
...r/projects/configs/bevformerv2/bevformerv2-r50-t1-24ep.py
+360
-0
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t1-48ep.py
...r/projects/configs/bevformerv2/bevformerv2-r50-t1-48ep.py
+360
-0
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t1-base-24ep.py
...jects/configs/bevformerv2/bevformerv2-r50-t1-base-24ep.py
+349
-0
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py
...jects/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py
+349
-0
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t2-24ep.py
...r/projects/configs/bevformerv2/bevformerv2-r50-t2-24ep.py
+360
-0
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t2-48ep.py
...r/projects/configs/bevformerv2/bevformerv2-r50-t2-48ep.py
+360
-0
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t8-24ep.py
...r/projects/configs/bevformerv2/bevformerv2-r50-t8-24ep.py
+361
-0
docker-hub/BEVFormer/BEVFormer/projects/configs/datasets/custom_lyft-3d.py
...mer/BEVFormer/projects/configs/datasets/custom_lyft-3d.py
+136
-0
docker-hub/BEVFormer/BEVFormer/projects/configs/datasets/custom_nus-3d.py
...rmer/BEVFormer/projects/configs/datasets/custom_nus-3d.py
+141
-0
docker-hub/BEVFormer/BEVFormer/projects/configs/datasets/custom_waymo-3d.py
...er/BEVFormer/projects/configs/datasets/custom_waymo-3d.py
+112
-0
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/__init__.py
...b/BEVFormer/BEVFormer/projects/mmdet3d_plugin/__init__.py
+11
-0
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/__init__.py
...r/BEVFormer/projects/mmdet3d_plugin/bevformer/__init__.py
+6
-0
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/apis/__init__.py
...Former/projects/mmdet3d_plugin/bevformer/apis/__init__.py
+3
-0
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py
...mer/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py
+283
-0
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py_old
...projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py_old
+200
-0
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/apis/test.py
.../BEVFormer/projects/mmdet3d_plugin/bevformer/apis/test.py
+256
-0
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/apis/train.py
...BEVFormer/projects/mmdet3d_plugin/bevformer/apis/train.py
+67
-0
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py
...projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py
+2
-0
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/dense_heads/bev_head.py
...projects/mmdet3d_plugin/bevformer/dense_heads/bev_head.py
+132
-0
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py
...ts/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py
+683
-0
No files found.
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t1-24ep.py
0 → 100644
View file @
007f2e68
# mAP: 0.3805
# mATE: 0.7198
# mASE: 0.2805
# mAOE: 0.4131
# mAVE: 0.7652
# mAAE: 0.1951
# NDS: 0.4529
# Parent config list (presumably resolved by the mmcv Config loader — confirm).
_base_ = ['../_base_/default_runtime.py']

# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# NOTE(review): looks like [x_min, y_min, z_min, x_max, y_max, z_max] — confirm.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
    'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'

# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
# Only the camera modality is enabled in this config.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
# Per-channel normalization; std of 1 means only the mean is subtracted.
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)

# BEV grid resolution.
bev_h_ = 200
bev_w_ = 200
# Frame offsets used by the dataset/model; (0,) lists only offset 0.
frames = (0,)
group_detr = 11
# 102.4 equals the 51.2 - (-51.2) x/y extent of point_cloud_range in this file.
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]

# Image-space augmentation used for training.
# NOTE(review): the key is spelled "reisze" (sic). It is runtime text, so it is
# kept byte-for-byte — presumably the consuming transform reads this exact key;
# confirm before renaming.
ida_aug_conf = {
    # 512/640 = 0.8 ... 768/640 = 1.2 relative to the single eval size below.
    "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768],  # (0.8, 1.2)
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": True,
}
# Evaluation counterpart: single size, no random flip.
ida_aug_conf_eval = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline: an ordered list of transform configs. Each entry is
# selected by its `type` string; the transforms themselves are defined outside
# this file.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego2global_translation',
            'ego2global_rotation', 'lidar2ego_translation',
            'lidar2ego_rotation', 'timestamp', 'mono_input_dict',
            'mono_ann_idx', 'aug_param'
        ]),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),
    )
]
# Evaluation pipeline: deterministic crop/resize (ida_aug_conf_eval), then the
# same normalization and padding settings as training.
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img', 'ego2global_translation', 'ego2global_rotation',
                    'lidar2ego_translation', 'lidar2ego_rotation', 'timestamp'
                ])
        ])
]
# Dataset / dataloader settings.
# The dataset type and root now come from the `dataset_type` / `data_root`
# variables defined earlier in this file instead of repeated string literals.
# The values are identical; this keeps `data_root` and `ann_file` in sync if
# the root is ever changed.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    # The test split reads the same annotation file as the val split.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))

evaluation = dict(interval=4, pipeline=eval_pipeline)
# model
# Checkpoint path assigned to `load_from`.
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'

# Plugin settings; the directory holds the project-specific modules.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# Shared dimensions referenced by the model definition.
_dim_ = 256
_pos_dim_ = 128  # equals _dim_ / 2
_ffn_dim_ = 512  # equals _dim_ * 2
_num_levels_ = 4
_num_mono_levels_ = 5
# Model definition: ResNet-50 backbone + FPN neck feeding a BEV detection head
# (pts_bbox_head) and a second head configured under fcos3d_bbox_head.
model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe'),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                # NOTE(review): literal 4 equals _num_levels_;
                                # confirm they are meant to track each other.
                                num_levels=4),
                            embed_dims=_dim_)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        # Weight 0.0: the IoU term is configured but contributes nothing.
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        # One stride per FPN output (five entries, matching _num_mono_levels_).
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25,
            focal_loss_gamma=2.0,
            loc_loss_type='giou'),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            # Ten rows — presumably one per entry of class_names, in the same
            # order; verify against the head implementation.
            canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
                             [0.61416006, 1.7016163, 1.3054738],
                             [2.9139307, 10.725025, 3.2832346],
                             [1.9751819, 4.641267, 1.74352],
                             [2.772134, 6.565072, 3.2474296],
                             [0.7800532, 2.138673, 1.4437162],
                             [0.6667362, 0.7181772, 1.7616143],
                             [0.40246472, 0.4027083, 1.0084083],
                             [3.0059454, 12.8197, 4.1213827],
                             [2.4986045, 6.9310856, 2.8382742]]),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
                               (512, 100000000.0))),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            # Matching-cost weights mirror the loss weights in pts_bbox_head
            # (2.0 / 0.75 / 0.0).
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range))))
# optimizer
# AdamW; parameters matched by the `img_backbone` key get lr_mult=0.5.
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys=dict(
            img_backbone=dict(lr_mult=0.5),
        )),
    weight_decay=0.01)

# Gradient clipping by L2 norm.
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy
# Step schedule with linear warmup; the single step is at epoch 20 of 24.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[20, ])
total_epochs = 24

runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t1-48ep.py
0 → 100644
View file @
007f2e68
# mAP: 0.3953
# mATE: 0.6941
# mASE: 0.2765
# mAOE: 0.4199
# mAVE: 0.7537
# mAAE: 0.1866
# NDS: 0.4646
# Parent config list (presumably resolved by the mmcv Config loader — confirm).
_base_ = ['../_base_/default_runtime.py']

# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# NOTE(review): looks like [x_min, y_min, z_min, x_max, y_max, z_max] — confirm.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
    'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'

# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
# Only the camera modality is enabled in this config.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
# Per-channel normalization; std of 1 means only the mean is subtracted.
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)

# BEV grid resolution.
bev_h_ = 200
bev_w_ = 200
# Frame offsets used by the dataset/model; (0,) lists only offset 0.
frames = (0,)
group_detr = 11
# 102.4 equals the 51.2 - (-51.2) x/y extent of point_cloud_range in this file.
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]

# Image-space augmentation used for training.
# NOTE(review): the key is spelled "reisze" (sic). It is runtime text, so it is
# kept byte-for-byte — presumably the consuming transform reads this exact key;
# confirm before renaming.
ida_aug_conf = {
    # 512/640 = 0.8 ... 768/640 = 1.2 relative to the single eval size below.
    "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768],  # (0.8, 1.2)
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": True,
}
# Evaluation counterpart: single size, no random flip.
ida_aug_conf_eval = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline: an ordered list of transform configs. Each entry is
# selected by its `type` string; the transforms themselves are defined outside
# this file.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego2global_translation',
            'ego2global_rotation', 'lidar2ego_translation',
            'lidar2ego_rotation', 'timestamp', 'mono_input_dict',
            'mono_ann_idx', 'aug_param'
        ]),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),
    )
]
# Evaluation pipeline: deterministic crop/resize (ida_aug_conf_eval), then the
# same normalization and padding settings as training.
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img', 'ego2global_translation', 'ego2global_rotation',
                    'lidar2ego_translation', 'lidar2ego_rotation', 'timestamp'
                ])
        ])
]
# Dataset / dataloader settings.
# The dataset type and root now come from the `dataset_type` / `data_root`
# variables defined earlier in this file instead of repeated string literals.
# The values are identical; this keeps `data_root` and `ann_file` in sync if
# the root is ever changed.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    # The test split reads the same annotation file as the val split.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))

evaluation = dict(interval=4, pipeline=eval_pipeline)
# model
# Checkpoint path assigned to `load_from`.
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'

# Plugin settings; the directory holds the project-specific modules.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# Shared dimensions referenced by the model definition.
_dim_ = 256
_pos_dim_ = 128  # equals _dim_ / 2
_ffn_dim_ = 512  # equals _dim_ * 2
_num_levels_ = 4
_num_mono_levels_ = 5
# Model definition: ResNet-50 backbone + FPN neck feeding a BEV detection head
# (pts_bbox_head) and a second head configured under fcos3d_bbox_head.
model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe'),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                # NOTE(review): literal 4 equals _num_levels_;
                                # confirm they are meant to track each other.
                                num_levels=4),
                            embed_dims=_dim_)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        # Weight 0.0: the IoU term is configured but contributes nothing.
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        # One stride per FPN output (five entries, matching _num_mono_levels_).
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25,
            focal_loss_gamma=2.0,
            loc_loss_type='giou'),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            # Ten rows — presumably one per entry of class_names, in the same
            # order; verify against the head implementation.
            canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
                             [0.61416006, 1.7016163, 1.3054738],
                             [2.9139307, 10.725025, 3.2832346],
                             [1.9751819, 4.641267, 1.74352],
                             [2.772134, 6.565072, 3.2474296],
                             [0.7800532, 2.138673, 1.4437162],
                             [0.6667362, 0.7181772, 1.7616143],
                             [0.40246472, 0.4027083, 1.0084083],
                             [3.0059454, 12.8197, 4.1213827],
                             [2.4986045, 6.9310856, 2.8382742]]),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
                               (512, 100000000.0))),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            # Matching-cost weights mirror the loss weights in pts_bbox_head
            # (2.0 / 0.75 / 0.0).
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range))))
# optimizer
# AdamW; parameters matched by the `img_backbone` key get lr_mult=0.5.
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys=dict(
            img_backbone=dict(lr_mult=0.5),
        )),
    weight_decay=0.01)

# Gradient clipping by L2 norm.
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy
# Step schedule with linear warmup; the single step is at epoch 44 of 48.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[44, ])
total_epochs = 48

runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t1-base-24ep.py
0 → 100644
View file @
007f2e68
# mAP: 0.3512
# mATE: 0.7534
# mASE: 0.2863
# mAOE: 0.4665
# mAVE: 0.8070
# mAAE: 0.1861
# NDS: 0.4257
# Parent config list (presumably resolved by the mmcv Config loader — confirm).
_base_ = ['../_base_/default_runtime.py']

# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# NOTE(review): looks like [x_min, y_min, z_min, x_max, y_max, z_max] — confirm.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
    'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'

# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
# Only the camera modality is enabled in this config.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
# Per-channel normalization; std of 1 means only the mean is subtracted.
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)

# BEV grid resolution.
bev_h_ = 200
bev_w_ = 200
# Frame offsets used by the dataset/model; (0,) lists only offset 0.
frames = (0,)
# 102.4 equals the 51.2 - (-51.2) x/y extent of point_cloud_range in this file.
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]

# Image-space settings for training. In this config they are identical to the
# evaluation settings below (single size, no random flip).
# NOTE(review): the key is spelled "reisze" (sic). It is runtime text, so it is
# kept byte-for-byte — presumably the consuming transform reads this exact key;
# confirm before renaming.
ida_aug_conf = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}
ida_aug_conf_eval = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline: an ordered list of transform configs. Each entry is
# selected by its `type` string; the transforms themselves are defined outside
# this file.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego2global_translation',
            'ego2global_rotation', 'lidar2ego_translation',
            'lidar2ego_rotation', 'timestamp', 'mono_input_dict',
            'mono_ann_idx', 'aug_param'
        ]),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),
    )
]
# Evaluation pipeline: deterministic crop/resize (ida_aug_conf_eval), then the
# same normalization and padding settings as training.
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img', 'ego2global_translation', 'ego2global_rotation',
                    'lidar2ego_translation', 'lidar2ego_rotation', 'timestamp'
                ])
        ])
]
# Dataset / dataloader settings.
# The dataset type and root now come from the `dataset_type` / `data_root`
# variables defined earlier in this file instead of repeated string literals.
# The values are identical; this keeps `data_root` and `ann_file` in sync if
# the root is ever changed.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    # The test split reads the same annotation file as the val split.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))

evaluation = dict(interval=4, pipeline=eval_pipeline)
# model
# Checkpoint path assigned to `load_from`.
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'

# Plugin settings; the directory holds the project-specific modules.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# Shared dimensions referenced by the model definition.
_dim_ = 256
_pos_dim_ = 128  # equals _dim_ / 2
_ffn_dim_ = 512  # equals _dim_ * 2
_num_levels_ = 4
_num_mono_levels_ = 5
model
=
dict
(
type
=
'BEVFormerV2'
,
use_grid_mask
=
True
,
video_test_mode
=
False
,
num_levels
=
_num_levels_
,
num_mono_levels
=
_num_mono_levels_
,
mono_loss_weight
=
1.0
,
frames
=
frames
,
img_backbone
=
dict
(
type
=
'ResNet'
,
depth
=
50
,
num_stages
=
4
,
out_indices
=
(
1
,
2
,
3
),
frozen_stages
=-
1
,
norm_cfg
=
dict
(
type
=
'SyncBN'
),
norm_eval
=
False
,
style
=
'caffe'
),
img_neck
=
dict
(
type
=
'FPN'
,
in_channels
=
[
512
,
1024
,
2048
],
out_channels
=
_dim_
,
start_level
=
0
,
add_extra_convs
=
'on_output'
,
num_outs
=
_num_mono_levels_
,
relu_before_extra_convs
=
True
),
pts_bbox_head
=
dict
(
type
=
'BEVFormerHead'
,
bev_h
=
bev_h_
,
bev_w
=
bev_w_
,
num_query
=
900
,
num_classes
=
10
,
in_channels
=
_dim_
,
sync_cls_avg_factor
=
True
,
with_box_refine
=
True
,
as_two_stage
=
False
,
transformer
=
dict
(
type
=
'PerceptionTransformerV2'
,
embed_dims
=
_dim_
,
frames
=
frames
,
encoder
=
dict
(
type
=
'BEVFormerEncoder'
,
num_layers
=
6
,
pc_range
=
point_cloud_range
,
num_points_in_pillar
=
4
,
return_intermediate
=
False
,
transformerlayers
=
dict
(
type
=
'BEVFormerLayer'
,
attn_cfgs
=
[
dict
(
type
=
'TemporalSelfAttention'
,
embed_dims
=
_dim_
,
num_levels
=
1
),
dict
(
type
=
'SpatialCrossAttention'
,
pc_range
=
point_cloud_range
,
deformable_attention
=
dict
(
type
=
'MSDeformableAttention3D'
,
embed_dims
=
_dim_
,
num_points
=
8
,
num_levels
=
4
),
embed_dims
=
_dim_
)
],
feedforward_channels
=
_ffn_dim_
,
ffn_dropout
=
0.1
,
operation_order
=
(
'self_attn'
,
'norm'
,
'cross_attn'
,
'norm'
,
'ffn'
,
'norm'
))),
decoder
=
dict
(
type
=
'DetectionTransformerDecoder'
,
num_layers
=
6
,
return_intermediate
=
True
,
transformerlayers
=
dict
(
type
=
'DetrTransformerDecoderLayer'
,
attn_cfgs
=
[
dict
(
type
=
'MultiheadAttention'
,
embed_dims
=
_dim_
,
num_heads
=
8
,
dropout
=
0.1
),
dict
(
type
=
'CustomMSDeformableAttention'
,
embed_dims
=
_dim_
,
num_levels
=
1
)
],
feedforward_channels
=
_ffn_dim_
,
ffn_dropout
=
0.1
,
operation_order
=
(
'self_attn'
,
'norm'
,
'cross_attn'
,
'norm'
,
'ffn'
,
'norm'
)))),
bbox_coder
=
dict
(
type
=
'NMSFreeCoder'
,
post_center_range
=
[
-
61.2
,
-
61.2
,
-
10.0
,
61.2
,
61.2
,
10.0
],
pc_range
=
point_cloud_range
,
max_num
=
300
,
voxel_size
=
voxel_size
,
num_classes
=
10
),
positional_encoding
=
dict
(
type
=
'LearnedPositionalEncoding'
,
num_feats
=
_pos_dim_
,
row_num_embed
=
bev_h_
,
col_num_embed
=
bev_w_
),
loss_cls
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
2.0
),
loss_bbox
=
dict
(
type
=
'SmoothL1Loss'
,
loss_weight
=
0.75
,
beta
=
1.0
),
loss_iou
=
dict
(
type
=
'GIoULoss'
,
loss_weight
=
0.0
)),
fcos3d_bbox_head
=
dict
(
type
=
'NuscenesDD3D'
,
num_classes
=
10
,
in_channels
=
_dim_
,
strides
=
[
8
,
16
,
32
,
64
,
128
],
box3d_on
=
True
,
feature_locations_offset
=
'none'
,
fcos2d_cfg
=
dict
(
num_cls_convs
=
4
,
num_box_convs
=
4
,
norm
=
'SyncBN'
,
use_deformable
=
False
,
use_scale
=
True
,
box2d_scale_init_factor
=
1.0
),
fcos2d_loss_cfg
=
dict
(
focal_loss_alpha
=
0.25
,
focal_loss_gamma
=
2.0
,
loc_loss_type
=
'giou'
),
fcos3d_cfg
=
dict
(
num_convs
=
4
,
norm
=
'SyncBN'
,
use_scale
=
True
,
depth_scale_init_factor
=
0.3
,
proj_ctr_scale_init_factor
=
1.0
,
use_per_level_predictors
=
False
,
class_agnostic
=
False
,
use_deformable
=
False
,
mean_depth_per_level
=
[
44.921
,
20.252
,
11.712
,
7.166
,
8.548
],
std_depth_per_level
=
[
24.331
,
9.833
,
6.223
,
4.611
,
8.275
]),
fcos3d_loss_cfg
=
dict
(
min_depth
=
0.1
,
max_depth
=
80.0
,
box3d_loss_weight
=
2.0
,
conf3d_loss_weight
=
1.0
,
conf_3d_temperature
=
1.0
,
smooth_l1_loss_beta
=
0.05
,
max_loss_per_group
=
20
,
predict_allocentric_rot
=
True
,
scale_depth_by_focal_lengths
=
True
,
scale_depth_by_focal_lengths_factor
=
500.0
,
class_agnostic
=
False
,
predict_distance
=
False
,
canon_box_sizes
=
[[
2.3524184
,
0.5062202
,
1.0413622
],
[
0.61416006
,
1.7016163
,
1.3054738
],
[
2.9139307
,
10.725025
,
3.2832346
],
[
1.9751819
,
4.641267
,
1.74352
],
[
2.772134
,
6.565072
,
3.2474296
],
[
0.7800532
,
2.138673
,
1.4437162
],
[
0.6667362
,
0.7181772
,
1.7616143
],
[
0.40246472
,
0.4027083
,
1.0084083
],
[
3.0059454
,
12.8197
,
4.1213827
],
[
2.4986045
,
6.9310856
,
2.8382742
]]),
target_assign_cfg
=
dict
(
center_sample
=
True
,
pos_radius
=
1.5
,
sizes_of_interest
=
((
-
1
,
64
),
(
64
,
128
),
(
128
,
256
),
(
256
,
512
),
(
512
,
100000000.0
))),
nusc_loss_weight
=
dict
(
attr_loss_weight
=
0.2
,
speed_loss_weight
=
0.2
)),
train_cfg
=
dict
(
pts
=
dict
(
grid_size
=
[
512
,
512
,
1
],
voxel_size
=
voxel_size
,
point_cloud_range
=
point_cloud_range
,
out_size_factor
=
4
,
assigner
=
dict
(
type
=
'HungarianAssigner3D'
,
cls_cost
=
dict
(
type
=
'FocalLossCost'
,
weight
=
2.0
),
reg_cost
=
dict
(
type
=
'SmoothL1Cost'
,
weight
=
0.75
),
iou_cost
=
dict
(
type
=
'IoUCost'
,
weight
=
0.0
),
pc_range
=
point_cloud_range
))))
# optimizer
# AdamW; parameters matched by the `img_backbone` key get lr_mult=0.5.
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys=dict(
            img_backbone=dict(lr_mult=0.5),
        )),
    weight_decay=0.01)
# Gradient clipping settings (L2 norm, max 35).
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
# Step schedule with linear warmup over the first 2000 iterations and a
# single milestone at 20.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[20, ])
total_epochs = 24
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py
0 → 100644
View file @
007f2e68
# mAP: 0.3594
# mATE: 0.7327
# mASE: 0.2814
# mAOE: 0.4074
# mAVE: 0.7831
# mAAE: 0.1983
# NDS: 0.4394
# Inherit the shared runtime settings.
_base_ = ['../_base_/default_runtime.py']
# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle',
    'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
# Mean-only normalisation (std of 1), channels left in their loaded order.
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
# BEV grid resolution (rows x cols).
bev_h_ = 200
bev_w_ = 200
# Frame offsets used by the dataset and the model; only the current frame.
frames = (0,)
# 102.4 is the x/y extent of point_cloud_range (51.2 - (-51.2)).
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
# Image-space augmentation settings passed to `CropResizeFlipImage`.
# NOTE(review): the key is spelled "reisze" here; it is presumably read under
# that exact name by the transform, so do not rename it without checking.
ida_aug_conf = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}
# Evaluation-time counterpart (same values as the training settings here).
ida_aug_conf_eval = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline: load multi-view images and 3D annotations, filter
# objects by range and class, apply image augmentation, normalise, pad,
# then format and collect the keys listed below.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d', 'gt_labels_3d', 'img',
            'ego2global_translation', 'ego2global_rotation',
            'lidar2ego_translation', 'lidar2ego_rotation', 'timestamp',
            'mono_input_dict', 'mono_ann_idx', 'aug_param'
        ]),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True), )
]
# Evaluation pipeline: deterministic crop/resize, normalise, pad, then
# format and collect inside a single-scale, no-flip MultiScaleFlipAug3D.
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img', 'ego2global_translation', 'ego2global_rotation',
                    'lidar2ego_translation', 'lidar2ego_rotation',
                    'timestamp'
                ])
        ])
]
# Dataset / dataloader settings.
# `dataset_type` and `data_root` are referenced everywhere instead of
# repeating their literal values, so changing the root cannot leave
# `data_root` and `ann_file` pointing at different locations.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        # Settings for the monocular (per-camera) annotations.
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    # Test uses the same annotation file as val.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
evaluation = dict(interval=4, pipeline=eval_pipeline)
# model
# Checkpoint to initialise from, and the plugin package to load.
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# Shared sizes referenced by the model definition.
_dim_ = 256
_pos_dim_ = 128
_ffn_dim_ = 512
_num_levels_ = 4
_num_mono_levels_ = 5
# BEVFormerV2 detector: ResNet-50 backbone + FPN neck feeding a BEV
# transformer head (`pts_bbox_head`) and a DD3D head (`fcos3d_bbox_head`).
model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe'),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                # Same value as the model-level num_levels.
                                num_levels=_num_levels_),
                            embed_dims=_dim_)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        # IoU loss is configured but weighted 0.0.
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25,
            focal_loss_gamma=2.0,
            loc_loss_type='giou'),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            # One entry per stride listed above.
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            # Ten 3-value rows; presumably one per class -- TODO confirm
            # the row order against the head implementation.
            canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
                             [0.61416006, 1.7016163, 1.3054738],
                             [2.9139307, 10.725025, 3.2832346],
                             [1.9751819, 4.641267, 1.74352],
                             [2.772134, 6.565072, 3.2474296],
                             [0.7800532, 2.138673, 1.4437162],
                             [0.6667362, 0.7181772, 1.7616143],
                             [0.40246472, 0.4027083, 1.0084083],
                             [3.0059454, 12.8197, 4.1213827],
                             [2.4986045, 6.9310856, 2.8382742]]),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
                               (512, 100000000.0))),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            # Matching cost weights mirror the loss weights in pts_bbox_head.
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range))))
# optimizer
# AdamW; parameters matched by the `img_backbone` key get lr_mult=0.5.
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys=dict(
            img_backbone=dict(lr_mult=0.5),
        )),
    weight_decay=0.01)
# Gradient clipping settings (L2 norm, max 35).
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
# Step schedule with linear warmup over the first 2000 iterations and a
# single milestone at 44.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[44, ])
total_epochs = 48
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t2-24ep.py
0 → 100644
View file @
007f2e68
# mAP: 0.4199
# mATE: 0.6689
# mASE: 0.2814
# mAOE: 0.3915
# mAVE: 0.3834
# mAAE: 0.1928
# NDS: 0.5182
# Inherit the shared runtime settings.
_base_ = ['../_base_/default_runtime.py']
# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle',
    'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
# Mean-only normalisation (std of 1), channels left in their loaded order.
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
# BEV grid resolution (rows x cols).
bev_h_ = 200
bev_w_ = 200
# Frame offsets used by the dataset and the model: previous and current.
frames = (-1, 0,)
# Group count shared by the GroupDETR head and its grouped self-attention.
group_detr = 11
# 102.4 is the x/y extent of point_cloud_range (51.2 - (-51.2)).
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
# Image-space augmentation settings passed to `CropResizeFlipImage`.
# NOTE(review): the key is spelled "reisze" here; it is presumably read under
# that exact name by the transform, so do not rename it without checking.
ida_aug_conf = {
    "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768],  # (0.8, 1.2)
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": True,
}
# Evaluation-time counterpart: single size, no random flip.
ida_aug_conf_eval = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline: load multi-view images and 3D annotations, apply the
# global rotate/scale/flip transform, filter objects by range and class,
# apply image augmentation, normalise, pad, then format and collect the
# keys listed below.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True, ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d', 'gt_labels_3d', 'img',
            'ego2global_translation', 'ego2global_rotation',
            'lidar2ego_translation', 'lidar2ego_rotation', 'timestamp',
            'mono_input_dict', 'mono_ann_idx', 'aug_param'
        ]),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True), )
]
# Evaluation pipeline: deterministic crop/resize, normalise, pad, then
# format and collect inside a single-scale, no-flip MultiScaleFlipAug3D.
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img', 'ego2global_translation', 'ego2global_rotation',
                    'lidar2ego_translation', 'lidar2ego_rotation',
                    'timestamp'
                ])
        ])
]
# Dataset / dataloader settings.
# `dataset_type` and `data_root` are referenced everywhere instead of
# repeating their literal values, so changing the root cannot leave
# `data_root` and `ann_file` pointing at different locations.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        # Settings for the monocular (per-camera) annotations.
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    # Test uses the same annotation file as val.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
evaluation = dict(interval=4, pipeline=eval_pipeline)
# model
# Checkpoint to initialise from, and the plugin package to load.
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# Shared sizes referenced by the model definition.
_dim_ = 256
_pos_dim_ = 128
_ffn_dim_ = 512
_num_levels_ = 4
_num_mono_levels_ = 5
# BEVFormerV2 detector: ResNet-50 backbone + FPN neck feeding a GroupDETR
# BEV transformer head (`pts_bbox_head`) and a DD3D head
# (`fcos3d_bbox_head`).
model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe'),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                # Same value as the model-level num_levels.
                                num_levels=_num_levels_),
                            embed_dims=_dim_)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        # Grouped self-attention; group count matches the
                        # head's `group_detr`.
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        # IoU loss is configured but weighted 0.0.
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25,
            focal_loss_gamma=2.0,
            loc_loss_type='giou'),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            # One entry per stride listed above.
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            # Ten 3-value rows; presumably one per class -- TODO confirm
            # the row order against the head implementation.
            canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
                             [0.61416006, 1.7016163, 1.3054738],
                             [2.9139307, 10.725025, 3.2832346],
                             [1.9751819, 4.641267, 1.74352],
                             [2.772134, 6.565072, 3.2474296],
                             [0.7800532, 2.138673, 1.4437162],
                             [0.6667362, 0.7181772, 1.7616143],
                             [0.40246472, 0.4027083, 1.0084083],
                             [3.0059454, 12.8197, 4.1213827],
                             [2.4986045, 6.9310856, 2.8382742]]),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
                               (512, 100000000.0))),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            # Matching cost weights mirror the loss weights in pts_bbox_head.
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range))))
# optimizer
# AdamW; parameters matched by the `img_backbone` key get lr_mult=0.5.
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys=dict(
            img_backbone=dict(lr_mult=0.5),
        )),
    weight_decay=0.01)
# Gradient clipping settings (L2 norm, max 35).
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
# Step schedule with linear warmup over the first 2000 iterations and a
# single milestone at 20.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[20, ])
total_epochs = 24
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t2-48ep.py
0 → 100644
View file @
007f2e68
# mAP: 0.4313
# mATE: 0.6557
# mASE: 0.2775
# mAOE: 0.3851
# mAVE: 0.3861
# mAAE: 0.1882
# NDS: 0.5264
# Inherit the shared runtime settings.
_base_ = ['../_base_/default_runtime.py']
# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle',
    'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
# Mean-only normalisation (std of 1), channels left in their loaded order.
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
# BEV grid resolution (rows x cols).
bev_h_ = 200
bev_w_ = 200
# Frame offsets used by the dataset and the model: previous and current.
frames = (-1, 0,)
# Group count shared by the GroupDETR head and its grouped self-attention.
group_detr = 11
# 102.4 is the x/y extent of point_cloud_range (51.2 - (-51.2)).
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
# Image-space augmentation settings passed to `CropResizeFlipImage`.
# NOTE(review): the key is spelled "reisze" here; it is presumably read under
# that exact name by the transform, so do not rename it without checking.
ida_aug_conf = {
    "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768],  # (0.8, 1.2)
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": True,
}
# Evaluation-time counterpart: single size, no random flip.
ida_aug_conf_eval = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline: load multi-view images and 3D annotations, apply the
# global rotate/scale/flip transform, filter objects by range and class,
# apply image augmentation, normalise, pad, then format and collect the
# keys listed below.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True, ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d', 'gt_labels_3d', 'img',
            'ego2global_translation', 'ego2global_rotation',
            'lidar2ego_translation', 'lidar2ego_rotation', 'timestamp',
            'mono_input_dict', 'mono_ann_idx', 'aug_param'
        ]),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True), )
]
# Evaluation pipeline: deterministic crop/resize, normalise, pad, then
# format and collect inside a single-scale, no-flip MultiScaleFlipAug3D.
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img', 'ego2global_translation', 'ego2global_rotation',
                    'lidar2ego_translation', 'lidar2ego_rotation',
                    'timestamp'
                ])
        ])
]
# Dataset / dataloader settings.
# `dataset_type` and `data_root` are referenced everywhere instead of
# repeating their literal values, so changing the root cannot leave
# `data_root` and `ann_file` pointing at different locations.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        # Settings for the monocular (per-camera) annotations.
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    # Test uses the same annotation file as val.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
evaluation = dict(interval=4, pipeline=eval_pipeline)
# model
# Checkpoint to initialise from, and the plugin package to load.
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# Shared sizes referenced by the model definition.
_dim_ = 256
_pos_dim_ = 128
_ffn_dim_ = 512
_num_levels_ = 4
_num_mono_levels_ = 5
model
=
dict
(
type
=
'BEVFormerV2'
,
use_grid_mask
=
True
,
video_test_mode
=
False
,
num_levels
=
_num_levels_
,
num_mono_levels
=
_num_mono_levels_
,
mono_loss_weight
=
1.0
,
frames
=
frames
,
img_backbone
=
dict
(
type
=
'ResNet'
,
depth
=
50
,
num_stages
=
4
,
out_indices
=
(
1
,
2
,
3
),
frozen_stages
=-
1
,
norm_cfg
=
dict
(
type
=
'SyncBN'
),
norm_eval
=
False
,
style
=
'caffe'
),
img_neck
=
dict
(
type
=
'FPN'
,
in_channels
=
[
512
,
1024
,
2048
],
out_channels
=
_dim_
,
start_level
=
0
,
add_extra_convs
=
'on_output'
,
num_outs
=
_num_mono_levels_
,
relu_before_extra_convs
=
True
),
pts_bbox_head
=
dict
(
type
=
'BEVFormerHead_GroupDETR'
,
group_detr
=
group_detr
,
bev_h
=
bev_h_
,
bev_w
=
bev_w_
,
num_query
=
900
,
num_classes
=
10
,
in_channels
=
_dim_
,
sync_cls_avg_factor
=
True
,
with_box_refine
=
True
,
as_two_stage
=
False
,
transformer
=
dict
(
type
=
'PerceptionTransformerV2'
,
embed_dims
=
_dim_
,
frames
=
frames
,
encoder
=
dict
(
type
=
'BEVFormerEncoder'
,
num_layers
=
6
,
pc_range
=
point_cloud_range
,
num_points_in_pillar
=
4
,
return_intermediate
=
False
,
transformerlayers
=
dict
(
type
=
'BEVFormerLayer'
,
attn_cfgs
=
[
dict
(
type
=
'TemporalSelfAttention'
,
embed_dims
=
_dim_
,
num_levels
=
1
),
dict
(
type
=
'SpatialCrossAttention'
,
pc_range
=
point_cloud_range
,
deformable_attention
=
dict
(
type
=
'MSDeformableAttention3D'
,
embed_dims
=
_dim_
,
num_points
=
8
,
num_levels
=
4
),
embed_dims
=
_dim_
)
],
feedforward_channels
=
_ffn_dim_
,
ffn_dropout
=
0.1
,
operation_order
=
(
'self_attn'
,
'norm'
,
'cross_attn'
,
'norm'
,
'ffn'
,
'norm'
))),
decoder
=
dict
(
type
=
'DetectionTransformerDecoder'
,
num_layers
=
6
,
return_intermediate
=
True
,
transformerlayers
=
dict
(
type
=
'DetrTransformerDecoderLayer'
,
attn_cfgs
=
[
dict
(
type
=
'GroupMultiheadAttention'
,
group
=
group_detr
,
embed_dims
=
_dim_
,
num_heads
=
8
,
dropout
=
0.1
),
dict
(
type
=
'CustomMSDeformableAttention'
,
embed_dims
=
_dim_
,
num_levels
=
1
)
],
feedforward_channels
=
_ffn_dim_
,
ffn_dropout
=
0.1
,
operation_order
=
(
'self_attn'
,
'norm'
,
'cross_attn'
,
'norm'
,
'ffn'
,
'norm'
)))),
bbox_coder
=
dict
(
type
=
'NMSFreeCoder'
,
post_center_range
=
[
-
61.2
,
-
61.2
,
-
10.0
,
61.2
,
61.2
,
10.0
],
pc_range
=
point_cloud_range
,
max_num
=
300
,
voxel_size
=
voxel_size
,
num_classes
=
10
),
positional_encoding
=
dict
(
type
=
'LearnedPositionalEncoding'
,
num_feats
=
_pos_dim_
,
row_num_embed
=
bev_h_
,
col_num_embed
=
bev_w_
),
loss_cls
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
2.0
),
loss_bbox
=
dict
(
type
=
'SmoothL1Loss'
,
loss_weight
=
0.75
,
beta
=
1.0
),
loss_iou
=
dict
(
type
=
'GIoULoss'
,
loss_weight
=
0.0
)),
fcos3d_bbox_head
=
dict
(
type
=
'NuscenesDD3D'
,
num_classes
=
10
,
in_channels
=
_dim_
,
strides
=
[
8
,
16
,
32
,
64
,
128
],
box3d_on
=
True
,
feature_locations_offset
=
'none'
,
fcos2d_cfg
=
dict
(
num_cls_convs
=
4
,
num_box_convs
=
4
,
norm
=
'SyncBN'
,
use_deformable
=
False
,
use_scale
=
True
,
box2d_scale_init_factor
=
1.0
),
fcos2d_loss_cfg
=
dict
(
focal_loss_alpha
=
0.25
,
focal_loss_gamma
=
2.0
,
loc_loss_type
=
'giou'
),
fcos3d_cfg
=
dict
(
num_convs
=
4
,
norm
=
'SyncBN'
,
use_scale
=
True
,
depth_scale_init_factor
=
0.3
,
proj_ctr_scale_init_factor
=
1.0
,
use_per_level_predictors
=
False
,
class_agnostic
=
False
,
use_deformable
=
False
,
mean_depth_per_level
=
[
44.921
,
20.252
,
11.712
,
7.166
,
8.548
],
std_depth_per_level
=
[
24.331
,
9.833
,
6.223
,
4.611
,
8.275
]),
fcos3d_loss_cfg
=
dict
(
min_depth
=
0.1
,
max_depth
=
80.0
,
box3d_loss_weight
=
2.0
,
conf3d_loss_weight
=
1.0
,
conf_3d_temperature
=
1.0
,
smooth_l1_loss_beta
=
0.05
,
max_loss_per_group
=
20
,
predict_allocentric_rot
=
True
,
scale_depth_by_focal_lengths
=
True
,
scale_depth_by_focal_lengths_factor
=
500.0
,
class_agnostic
=
False
,
predict_distance
=
False
,
canon_box_sizes
=
[[
2.3524184
,
0.5062202
,
1.0413622
],
[
0.61416006
,
1.7016163
,
1.3054738
],
[
2.9139307
,
10.725025
,
3.2832346
],
[
1.9751819
,
4.641267
,
1.74352
],
[
2.772134
,
6.565072
,
3.2474296
],
[
0.7800532
,
2.138673
,
1.4437162
],
[
0.6667362
,
0.7181772
,
1.7616143
],
[
0.40246472
,
0.4027083
,
1.0084083
],
[
3.0059454
,
12.8197
,
4.1213827
],
[
2.4986045
,
6.9310856
,
2.8382742
]]),
target_assign_cfg
=
dict
(
center_sample
=
True
,
pos_radius
=
1.5
,
sizes_of_interest
=
((
-
1
,
64
),
(
64
,
128
),
(
128
,
256
),
(
256
,
512
),
(
512
,
100000000.0
))),
nusc_loss_weight
=
dict
(
attr_loss_weight
=
0.2
,
speed_loss_weight
=
0.2
)),
train_cfg
=
dict
(
pts
=
dict
(
grid_size
=
[
512
,
512
,
1
],
voxel_size
=
voxel_size
,
point_cloud_range
=
point_cloud_range
,
out_size_factor
=
4
,
assigner
=
dict
(
type
=
'HungarianAssigner3D'
,
cls_cost
=
dict
(
type
=
'FocalLossCost'
,
weight
=
2.0
),
reg_cost
=
dict
(
type
=
'SmoothL1Cost'
,
weight
=
0.75
),
iou_cost
=
dict
(
type
=
'IoUCost'
,
weight
=
0.0
),
pc_range
=
point_cloud_range
))))
# ---------------------------------------------------------------------------
# Optimizer: AdamW, with the image backbone trained at half the base LR.
# ---------------------------------------------------------------------------
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys=dict(
            # parameters matched by the key "img_backbone" get lr * 0.5
            img_backbone=dict(lr_mult=0.5),
        ),
    ),
    weight_decay=0.01,
)

# Gradient clipping by L2 norm.
optimizer_config = dict(
    grad_clip=dict(max_norm=35, norm_type=2),
)

# ---------------------------------------------------------------------------
# Learning-rate policy: linear warm-up followed by a step schedule.
# ---------------------------------------------------------------------------
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[20, ],
)

# The runner trains for `total_epochs` epochs.
total_epochs = 24
runner = dict(
    type='EpochBasedRunner',
    max_epochs=total_epochs,
)
docker-hub/BEVFormer/BEVFormer/projects/configs/bevformerv2/bevformerv2-r50-t8-24ep.py
0 → 100644
View file @
007f2e68
# mAP: 0.4600
# mATE: 0.6185
# mASE: 0.2815
# mAOE: 0.3660
# mAVE: 0.3157
# mAAE: 0.1902
# NDS: 0.5528
# NOTE(review): metrics as recorded by the config authors; presumably the
# evaluation result of this exact schedule -- confirm before quoting them.
_base_ = ['../_base_/default_runtime.py']

# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'barrier',
    'bicycle',
    'bus',
    'car',
    'construction_vehicle',
    'motorcycle',
    'pedestrian',
    'traffic_cone',
    'trailer',
    'truck',
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'

# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)

# Per-channel mean subtraction only (std of 1, no channel swap).
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675],
    std=[1, 1, 1],
    to_rgb=False,
)

# BEV grid resolution.
bev_h_ = 200
bev_w_ = 200

# Frame offsets passed to the dataset and the model; 0 is the last entry.
frames = (-7, -6, -5, -4, -3, -2, -1, 0)
group_detr = 11

# 102.4 is the x/y extent of point_cloud_range (51.2 - (-51.2)) and 8 is its
# z extent (3.0 - (-5.0)), so one voxel corresponds to one BEV cell.
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
# Image-space augmentation settings consumed by the 'CropResizeFlipImage'
# pipeline step (training variant).
# NOTE(review): the key is spelled "reisze" (sic). It is presumably looked up
# under exactly this name by the transform, so do not "fix" the spelling here
# without changing the consumer as well.
ida_aug_conf = {
    "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768],  # (0.8, 1.2)
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": True,
}

# Evaluation variant: a single resize value and no random flip.
ida_aug_conf_eval = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline: load multi-view images and 3D annotations, augment,
# filter ground truth, normalise/pad the images, then collect the tensors.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    # BEV-space rotation / scaling / flipping.
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    # Image-space crop / resize / flip driven by ida_aug_conf.
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d',
            'gt_labels_3d',
            'img',
            'ego2global_translation',
            'ego2global_rotation',
            'lidar2ego_translation',
            'lidar2ego_rotation',
            'timestamp',
            'mono_input_dict',
            'mono_ann_idx',
            'aug_param',
        ],
    ),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),
    ),
]
# Evaluation pipeline: deterministic image preprocessing (ida_aug_conf_eval),
# then formatting/collection wrapped in MultiScaleFlipAug3D with a single
# scale and flipping disabled.
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img',
                    'ego2global_translation',
                    'ego2global_rotation',
                    'lidar2ego_translation',
                    'lidar2ego_rotation',
                    'timestamp',
                ],
            ),
        ],
    ),
]
# Data loaders: the training split uses train_pipeline; val and test both
# read the temporal *val* info file through eval_pipeline.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        # Settings for the monocular (per-camera) annotations.
        mono_cfg=dict(
            name='nusc_trainval',
            data_root='data/nuscenes/',
            min_num_lidar_points=3,
            min_box_visibility=0.2,
        ),
    ),
    val=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        # NOTE(review): presumably popped by the training entry point to size
        # the validation loader -- confirm against the caller.
        samples_per_gpu=1,
    ),
    test=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)

# Run evaluation every 4 epochs/iterations of the runner.
evaluation = dict(interval=4, pipeline=eval_pipeline)
# model
# Checkpoint loaded before training starts.
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'

# Enable the project plugin and tell the tools where to import it from.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# Shared sizes referenced throughout the model definition below.
_dim_ = 256             # embedding / channel width
_pos_dim_ = 128         # positional-encoding features (num_feats)
_ffn_dim_ = 512         # feed-forward channels
_num_levels_ = 4        # `num_levels` of the model
_num_mono_levels_ = 5   # `num_mono_levels` of the model and FPN `num_outs`
# BEVFormerV2 detector: ResNet-50 + FPN image features feed both the BEV
# detection head (pts_bbox_head) and the monocular head (fcos3d_bbox_head).
model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    # ---------------------------------------------------------------- backbone
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe',
    ),
    # -------------------------------------------------------------------- neck
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True,
    ),
    # ---------------------------------------------------------------- BEV head
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            inter_channels=_dim_ * 2,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=4,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10,
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        # GIoU loss is configured but weighted to zero.
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
    ),
    # --------------------------------------------------------------- mono head
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0,
        ),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25,
            focal_loss_gamma=2.0,
            loc_loss_type='giou',
        ),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            # One entry per feature level (five levels, matching `strides`).
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275],
        ),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            # Ten rows -- presumably one canonical box size per class, in the
            # order of class_names; TODO confirm.
            canon_box_sizes=[
                [2.3524184, 0.5062202, 1.0413622],
                [0.61416006, 1.7016163, 1.3054738],
                [2.9139307, 10.725025, 3.2832346],
                [1.9751819, 4.641267, 1.74352],
                [2.772134, 6.565072, 3.2474296],
                [0.7800532, 2.138673, 1.4437162],
                [0.6667362, 0.7181772, 1.7616143],
                [0.40246472, 0.4027083, 1.0084083],
                [3.0059454, 12.8197, 4.1213827],
                [2.4986045, 6.9310856, 2.8382742],
            ],
        ),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            # One (low, high) size interval per feature level.
            sizes_of_interest=(
                (-1, 64),
                (64, 128),
                (128, 256),
                (256, 512),
                (512, 100000000.0),
            ),
        ),
        nusc_loss_weight=dict(
            attr_loss_weight=0.2,
            speed_loss_weight=0.2,
        ),
    ),
    # --------------------------------------------------------------- train cfg
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            # Matching costs use the same weights as the head losses above.
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)
# ---------------------------------------------------------------------------
# Optimizer: AdamW, with the image backbone trained at half the base LR.
# ---------------------------------------------------------------------------
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys=dict(
            # parameters matched by the key "img_backbone" get lr * 0.5
            img_backbone=dict(lr_mult=0.5),
        ),
    ),
    weight_decay=0.01,
)

# Gradient clipping by L2 norm.
optimizer_config = dict(
    grad_clip=dict(max_norm=35, norm_type=2),
)

# ---------------------------------------------------------------------------
# Learning-rate policy: linear warm-up followed by a step schedule.
# ---------------------------------------------------------------------------
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[20, ],
)

# The runner trains for `total_epochs` epochs.
total_epochs = 24
runner = dict(
    type='EpochBasedRunner',
    max_epochs=total_epochs,
)
docker-hub/BEVFormer/BEVFormer/projects/configs/datasets/custom_lyft-3d.py
0 → 100644
View file @
007f2e68
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-80, -80, -5, 80, 80, 3]

# For Lyft we usually do 9-class detection
class_names = [
    'car',
    'truck',
    'bus',
    'emergency_vehicle',
    'other_vehicle',
    'motorcycle',
    'bicycle',
    'pedestrian',
    'animal',
]
dataset_type = 'CustomLyftDataset'
data_root = 'data/lyft/'

# Input modality for Lyft dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=True,
    use_camera=False,
    use_radar=False,
    use_map=False,
    use_external=True,
)

file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/lyft/': 's3://lyft/lyft/',
#         'data/lyft/': 's3://lyft/lyft/'
#     }))

# Training pipeline: load the key frame plus 10 sweeps, augment in 3D,
# filter by range, shuffle points and collect.
train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
    dict(
        type='GlobalRotScaleTrans',
        rot_range=[-0.3925, 0.3925],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
    ),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]

# Test pipeline: same loading, with identity "augmentations" wrapped in
# MultiScaleFlipAug3D (single scale, no flip).
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='GlobalRotScaleTrans',
                rot_range=[0, 0],
                scale_ratio_range=[1., 1.],
                translation_std=[0, 0, 0],
            ),
            dict(type='RandomFlip3D'),
            dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='Collect3D', keys=['points']),
        ],
    ),
]

# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(
        type='DefaultFormatBundle3D',
        class_names=class_names,
        with_label=False,
    ),
    dict(type='Collect3D', keys=['points']),
]

# val and test both read the validation info file through test_pipeline.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'lyft_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'lyft_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'lyft_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
    ),
)

# For Lyft dataset, we usually evaluate the model at the end of training.
# Since the models are trained by 24 epochs by default, we set evaluation
# interval to be 24. Please change the interval accordingly if you do not
# use a default schedule.
evaluation = dict(interval=24, pipeline=eval_pipeline)
\ No newline at end of file
docker-hub/BEVFormer/BEVFormer/projects/configs/datasets/custom_nus-3d.py
0 → 100644
View file @
007f2e68
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-50, -50, -5, 50, 50, 3]

# For nuScenes we usually do 10-class detection
class_names = [
    'car',
    'truck',
    'trailer',
    'bus',
    'construction_vehicle',
    'bicycle',
    'motorcycle',
    'pedestrian',
    'traffic_cone',
    'barrier',
]
dataset_type = 'NuScenesDataset_eval_modified'
data_root = 'data/nuscenes/'

# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=True,
    use_camera=False,
    use_radar=False,
    use_map=False,
    use_external=False,
)

file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
#     }))

# Training pipeline: load the key frame plus 10 sweeps, augment in 3D,
# filter by range and class name, shuffle points and collect.
train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
    dict(
        type='GlobalRotScaleTrans',
        rot_range=[-0.3925, 0.3925],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
    ),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]

# Test pipeline: same loading, with identity "augmentations" wrapped in
# MultiScaleFlipAug3D (single scale, no flip).
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='GlobalRotScaleTrans',
                rot_range=[0, 0],
                scale_ratio_range=[1., 1.],
                translation_std=[0, 0, 0],
            ),
            dict(type='RandomFlip3D'),
            dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='Collect3D', keys=['points']),
        ],
    ),
]

# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(
        type='DefaultFormatBundle3D',
        class_names=class_names,
        with_label=False,
    ),
    dict(type='Collect3D', keys=['points']),
]

# val and test both read the validation info file through test_pipeline.
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR',
    ),
    val=dict(
        type=dataset_type,
        # data_root was previously omitted here only; it is passed explicitly
        # so that the val split is configured like the train and test splits.
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR',
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR',
    ),
)

# For nuScenes dataset, we usually evaluate the model at the end of training.
# Since the models are trained by 24 epochs by default, we set evaluation
# interval to be 24. Please change the interval accordingly if you do not
# use a default schedule.
evaluation = dict(interval=24, pipeline=eval_pipeline)
docker-hub/BEVFormer/BEVFormer/projects/configs/datasets/custom_waymo-3d.py
0 → 100644
View file @
007f2e68
# dataset settings
# D5 in the config name means the whole dataset is divided into 5 folds
# We only use one fold for efficient experiments
dataset_type = 'CustomWaymoDataset'
data_root = 'data/waymo/kitti_format/'

file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel', path_mapping=dict(data='s3://waymo_data/'))

# Per-channel mean subtraction only (std of 1, no channel swap).
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675],
    std=[1.0, 1.0, 1.0],
    to_rgb=False,
)
class_names = ['Car', 'Pedestrian', 'Cyclist']
point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
input_modality = dict(use_lidar=False, use_camera=True)

# Ground-truth database sampler settings.
# NOTE(review): not referenced by any pipeline defined in this file.
db_sampler = dict(
    data_root=data_root,
    info_path=data_root + 'waymo_dbinfos_train.pkl',
    rate=1.0,
    prepare=dict(
        filter_by_difficulty=[-1],
        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10),
    ),
    classes=class_names,
    sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
    points_loader=dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=[0, 1, 2, 3, 4],
        file_client_args=file_client_args,
    ),
)

# Camera-only training pipeline.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']),
]

# Camera-only test pipeline (single scale, no flip).
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1920, 1280),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='CustomCollect3D', keys=['img']),
        ],
    ),
]

# No separate eval/show pipeline is defined in this file; `evaluation` at the
# bottom reuses test_pipeline.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=4,
    train=dict(
        type='RepeatDataset',
        times=2,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            ann_file=data_root + 'waymo_infos_train.pkl',
            split='training',
            pipeline=train_pipeline,
            modality=input_modality,
            classes=class_names,
            test_mode=False,
            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
            box_type_3d='LiDAR',
            # load one frame every five frames
            load_interval=5,
        ),
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'waymo_infos_val.pkl',
        split='training',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True,
        box_type_3d='LiDAR',
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'waymo_infos_val.pkl',
        split='training',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True,
        box_type_3d='LiDAR',
    ),
)

evaluation = dict(interval=24, pipeline=test_pipeline)
\ No newline at end of file
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/__init__.py
0 → 100755
View file @
007f2e68
from
.core.bbox.assigners.hungarian_assigner_3d
import
HungarianAssigner3D
from
.core.bbox.coders.nms_free_coder
import
NMSFreeCoder
from
.core.bbox.match_costs
import
BBox3DL1Cost
from
.core.evaluation.eval_hooks
import
CustomDistEvalHook
from
.datasets.pipelines
import
(
PhotoMetricDistortionMultiViewImage
,
PadMultiViewImage
,
NormalizeMultiviewImage
,
CustomCollect3D
)
from
.models.utils
import
*
from
.models.opt.adamw
import
AdamW2
from
.bevformer
import
*
from
.dd3d
import
*
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/__init__.py
0 → 100644
View file @
007f2e68
from
.dense_heads
import
*
from
.detectors
import
*
from
.modules
import
*
from
.runner
import
*
from
.hooks
import
*
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/apis/__init__.py
0 → 100644
View file @
007f2e68
from
.train
import
custom_train_model
from
.mmdet_train
import
custom_train_detector
# from .test import custom_multi_gpu_test
\ No newline at end of file
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py
0 → 100644
View file @
007f2e68
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import
random
import
warnings
import
numpy
as
np
import
torch
import
torch.distributed
as
dist
from
mmcv.parallel
import
MMDataParallel
,
MMDistributedDataParallel
from
mmcv.runner
import
(
HOOKS
,
DistSamplerSeedHook
,
EpochBasedRunner
,
Fp16OptimizerHook
,
OptimizerHook
,
build_optimizer
,
build_runner
,
get_dist_info
)
from
mmcv.utils
import
build_from_cfg
from
mmdet.core
import
EvalHook
from
mmdet.datasets
import
(
build_dataset
,
replace_ImageToTensor
)
from
mmdet.utils
import
get_root_logger
import
time
import
os.path
as
osp
from
projects.mmdet3d_plugin.datasets.builder
import
build_dataloader
from
projects.mmdet3d_plugin.core.evaluation.eval_hooks
import
CustomDistEvalHook
from
projects.mmdet3d_plugin.datasets
import
custom_build_dataset
from
mmcv.runner
import
Hook
class ProfilerHook(Hook):
    """Step a profiler once per training iteration and dump its summary.

    After every training iteration the hook advances ``profiler`` by one
    step.  Once the profiler's ``step_num`` equals ``total_steps`` the
    profiler is stopped exactly once and, on rank 0 only, the
    ``key_averages()`` table sorted by ``cuda_time_total`` is written to
    ``<log_dir>/BW_log_step<total_steps>.txt``.

    Args:
        profiler: Profiler object exposing ``step_num``, ``step()``,
            ``stop()`` and ``key_averages()`` (presumably a
            ``torch.profiler.profile`` instance -- confirm at the call site).
        total_steps (int): Number of profiler steps to record, i.e.
            ``(wait + warmup + active) * repeat`` of the profiler schedule.
        log_dir (str): Directory that receives the summary file.  It is
            created on demand.  The default keeps the previously hard-coded
            location.
    """

    def __init__(self, profiler, total_steps,
                 log_dir="/workspace/BEVFormer/profiler_logs"):
        self.profiler = profiler
        # Total number of steps: (wait + warmup + active) * repeat.
        self.total_steps = total_steps
        self.log_dir = log_dir
        self.stopped = False

    def after_train_iter(self, runner):
        """Advance the profiler, or stop it and write the report when done."""
        # Local import: the module only binds ``os.path`` (as ``osp``).
        import os

        if self.stopped:
            return

        if self.profiler.step_num != self.total_steps:
            self.profiler.step()
            return

        # All requested steps were recorded: stop the profiler (once).
        self.profiler.stop()
        self.stopped = True

        # Only rank 0 writes the results.
        rank, _ = get_dist_info()
        if rank != 0:
            return

        # Collect the aggregated key metrics and persist them.
        results = self.profiler.key_averages().table(
            sort_by="cuda_time_total")
        # The target directory may not exist yet; without this the open()
        # below fails and the whole profiling run is lost.
        os.makedirs(self.log_dir, exist_ok=True)
        log_file = os.path.join(
            self.log_dir, "BW_log_step{}.txt".format(self.total_steps))
        with open(log_file, mode='w', encoding='utf-8') as log_fh:
            log_fh.write(str(results))
def
custom_train_detector
(
model
,
dataset
,
cfg
,
distributed
=
False
,
validate
=
False
,
timestamp
=
None
,
eval_model
=
None
,
meta
=
None
):
logger
=
get_root_logger
(
cfg
.
log_level
)
# prepare data loaders
dataset
=
dataset
if
isinstance
(
dataset
,
(
list
,
tuple
))
else
[
dataset
]
#assert len(dataset)==1s
if
'imgs_per_gpu'
in
cfg
.
data
:
logger
.
warning
(
'"imgs_per_gpu" is deprecated in MMDet V2.0. '
'Please use "samples_per_gpu" instead'
)
if
'samples_per_gpu'
in
cfg
.
data
:
logger
.
warning
(
f
'Got "imgs_per_gpu"=
{
cfg
.
data
.
imgs_per_gpu
}
and '
f
'"samples_per_gpu"=
{
cfg
.
data
.
samples_per_gpu
}
, "imgs_per_gpu"'
f
'=
{
cfg
.
data
.
imgs_per_gpu
}
is used in this experiments'
)
else
:
logger
.
warning
(
'Automatically set "samples_per_gpu"="imgs_per_gpu"='
f
'
{
cfg
.
data
.
imgs_per_gpu
}
in this experiments'
)
cfg
.
data
.
samples_per_gpu
=
cfg
.
data
.
imgs_per_gpu
data_loaders
=
[
build_dataloader
(
ds
,
cfg
.
data
.
samples_per_gpu
,
cfg
.
data
.
workers_per_gpu
,
# cfg.gpus will be ignored if distributed
len
(
cfg
.
gpu_ids
),
dist
=
distributed
,
seed
=
cfg
.
seed
,
shuffler_sampler
=
cfg
.
data
.
shuffler_sampler
,
# dict(type='DistributedGroupSampler'),
nonshuffler_sampler
=
cfg
.
data
.
nonshuffler_sampler
,
# dict(type='DistributedSampler'),
)
for
ds
in
dataset
]
# put model on gpus
if
distributed
:
find_unused_parameters
=
cfg
.
get
(
'find_unused_parameters'
,
False
)
# Sets the `find_unused_parameters` parameter in
# torch.nn.parallel.DistributedDataParallel
model
=
MMDistributedDataParallel
(
model
.
to
(
device
=
'cuda'
,
memory_format
=
torch
.
channels_last
),
device_ids
=
[
torch
.
cuda
.
current_device
()],
broadcast_buffers
=
False
,
find_unused_parameters
=
find_unused_parameters
)
if
eval_model
is
not
None
:
eval_model
=
MMDistributedDataParallel
(
eval_model
.
to
(
device
=
'cuda'
,
memory_format
=
torch
.
channels_last
),
device_ids
=
[
torch
.
cuda
.
current_device
()],
broadcast_buffers
=
False
,
find_unused_parameters
=
find_unused_parameters
)
else
:
model
=
MMDataParallel
(
model
.
cuda
(
cfg
.
gpu_ids
[
0
]),
device_ids
=
cfg
.
gpu_ids
)
if
eval_model
is
not
None
:
eval_model
=
MMDataParallel
(
eval_model
.
cuda
(
cfg
.
gpu_ids
[
0
]),
device_ids
=
cfg
.
gpu_ids
)
# build runner
optimizer
=
build_optimizer
(
model
,
cfg
.
optimizer
)
if
'runner'
not
in
cfg
:
cfg
.
runner
=
{
'type'
:
'EpochBasedRunner'
,
'max_epochs'
:
cfg
.
total_epochs
}
warnings
.
warn
(
'config is now expected to have a `runner` section, '
'please set `runner` in your config.'
,
UserWarning
)
else
:
if
'total_epochs'
in
cfg
:
assert
cfg
.
total_epochs
==
cfg
.
runner
.
max_epochs
if
eval_model
is
not
None
:
runner
=
build_runner
(
cfg
.
runner
,
default_args
=
dict
(
model
=
model
,
eval_model
=
eval_model
,
optimizer
=
optimizer
,
work_dir
=
cfg
.
work_dir
,
logger
=
logger
,
meta
=
meta
))
else
:
runner
=
build_runner
(
cfg
.
runner
,
default_args
=
dict
(
model
=
model
,
optimizer
=
optimizer
,
work_dir
=
cfg
.
work_dir
,
logger
=
logger
,
meta
=
meta
))
# an ugly workaround to make .log and .log.json filenames the same
runner
.
timestamp
=
timestamp
# fp16 setting
fp16_cfg
=
cfg
.
get
(
'fp16'
,
None
)
if
fp16_cfg
is
not
None
:
optimizer_config
=
Fp16OptimizerHook
(
**
cfg
.
optimizer_config
,
**
fp16_cfg
,
distributed
=
distributed
)
elif
distributed
and
'type'
not
in
cfg
.
optimizer_config
:
optimizer_config
=
OptimizerHook
(
**
cfg
.
optimizer_config
)
else
:
optimizer_config
=
cfg
.
optimizer_config
# register hooks
runner
.
register_training_hooks
(
cfg
.
lr_config
,
optimizer_config
,
cfg
.
checkpoint_config
,
cfg
.
log_config
,
cfg
.
get
(
'momentum_config'
,
None
))
# register profiler hook
#trace_config = dict(type='tb_trace', dir_name='work_dir')
#profiler_config = dict(on_trace_ready=trace_config)
#runner.register_profiler_hook(profiler_config)
if
distributed
:
if
isinstance
(
runner
,
EpochBasedRunner
):
runner
.
register_hook
(
DistSamplerSeedHook
())
# register eval hooks
if
validate
:
# Support batch_size > 1 in validation
val_samples_per_gpu
=
cfg
.
data
.
val
.
pop
(
'samples_per_gpu'
,
1
)
if
val_samples_per_gpu
>
1
:
assert
False
# Replace 'ImageToTensor' to 'DefaultFormatBundle'
cfg
.
data
.
val
.
pipeline
=
replace_ImageToTensor
(
cfg
.
data
.
val
.
pipeline
)
val_dataset
=
custom_build_dataset
(
cfg
.
data
.
val
,
dict
(
test_mode
=
True
))
val_dataloader
=
build_dataloader
(
val_dataset
,
samples_per_gpu
=
val_samples_per_gpu
,
workers_per_gpu
=
cfg
.
data
.
workers_per_gpu
,
dist
=
distributed
,
shuffle
=
False
,
shuffler_sampler
=
cfg
.
data
.
shuffler_sampler
,
# dict(type='DistributedGroupSampler'),
nonshuffler_sampler
=
cfg
.
data
.
nonshuffler_sampler
,
# dict(type='DistributedSampler'),
)
eval_cfg
=
cfg
.
get
(
'evaluation'
,
{})
eval_cfg
[
'by_epoch'
]
=
cfg
.
runner
[
'type'
]
!=
'IterBasedRunner'
eval_cfg
[
'jsonfile_prefix'
]
=
osp
.
join
(
'val'
,
cfg
.
work_dir
,
time
.
ctime
().
replace
(
' '
,
'_'
).
replace
(
':'
,
'_'
))
eval_hook
=
CustomDistEvalHook
if
distributed
else
EvalHook
runner
.
register_hook
(
eval_hook
(
val_dataloader
,
**
eval_cfg
))
# user-defined hooks
if
cfg
.
get
(
'custom_hooks'
,
None
):
custom_hooks
=
cfg
.
custom_hooks
assert
isinstance
(
custom_hooks
,
list
),
\
f
'custom_hooks expect list type, but got
{
type
(
custom_hooks
)
}
'
for
hook_cfg
in
cfg
.
custom_hooks
:
assert
isinstance
(
hook_cfg
,
dict
),
\
'Each item in custom_hooks expects dict type, but got '
\
f
'
{
type
(
hook_cfg
)
}
'
hook_cfg
=
hook_cfg
.
copy
()
priority
=
hook_cfg
.
pop
(
'priority'
,
'NORMAL'
)
hook
=
build_from_cfg
(
hook_cfg
,
HOOKS
)
runner
.
register_hook
(
hook
,
priority
=
priority
)
if
cfg
.
resume_from
:
runner
.
resume
(
cfg
.
resume_from
)
elif
cfg
.
load_from
:
runner
.
load_checkpoint
(
cfg
.
load_from
)
if
False
:
# 创建profiler配置
total_steps
=
(
1
+
20
+
1
)
*
1
# 22 steps
profiler
=
torch
.
profiler
.
profile
(
activities
=
[
torch
.
profiler
.
ProfilerActivity
.
CPU
,
torch
.
profiler
.
ProfilerActivity
.
CUDA
],
schedule
=
torch
.
profiler
.
schedule
(
wait
=
1
,
# 跳过前1个step
warmup
=
20
,
# 预热1个step(不计入结果)
active
=
1
,
# 分析3个step
repeat
=
1
# 只执行一轮
),
on_trace_ready
=
torch
.
profiler
.
tensorboard_trace_handler
(
# f"{cfg.work_dir}/profiler_logs" # 输出目录
"/workspace/BEVFormer/profiler_logs"
# "./profiler_logs"
),
with_stack
=
True
,
# 收集调用栈信息
profile_memory
=
False
,
# 分析内存使用
record_shapes
=
False
# 记录张量形状
)
# 创建并注册ProfilerHook
# profiler_hook = ProfilerHook(profiler)
profiler_hook
=
ProfilerHook
(
profiler
,
total_steps
)
runner
.
register_hook
(
profiler_hook
)
# 启动profiler
profiler
.
start
()
print
(
"==================================== profiler.start()==================================================================="
)
try
:
# 运行训练
runner
.
run
(
data_loaders
,
cfg
.
workflow
)
finally
:
# 确保profiler停止
profiler
.
stop
()
else
:
# 正常训练
runner
.
run
(
data_loaders
,
cfg
.
workflow
)
# runner.run(data_loaders, cfg.workflow)
#runner.run(data_loaders, cfg.workflow)
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py_old
0 → 100644
View file @
007f2e68
#
---------------------------------------------
#
Copyright
(
c
)
OpenMMLab
.
All
rights
reserved
.
#
---------------------------------------------
#
Modified
by
Zhiqi
Li
#
---------------------------------------------
import
random
import
warnings
import
numpy
as
np
import
torch
import
torch
.
distributed
as
dist
from
mmcv
.
parallel
import
MMDataParallel
,
MMDistributedDataParallel
from
mmcv
.
runner
import
(
HOOKS
,
DistSamplerSeedHook
,
EpochBasedRunner
,
Fp16OptimizerHook
,
OptimizerHook
,
build_optimizer
,
build_runner
,
get_dist_info
)
from
mmcv
.
utils
import
build_from_cfg
from
mmdet
.
core
import
EvalHook
from
mmdet
.
datasets
import
(
build_dataset
,
replace_ImageToTensor
)
from
mmdet
.
utils
import
get_root_logger
import
time
import
os
.
path
as
osp
from
projects
.
mmdet3d_plugin
.
datasets
.
builder
import
build_dataloader
from
projects
.
mmdet3d_plugin
.
core
.
evaluation
.
eval_hooks
import
CustomDistEvalHook
from
projects
.
mmdet3d_plugin
.
datasets
import
custom_build_dataset
def custom_train_detector(model,
                          dataset,
                          cfg,
                          distributed=False,
                          validate=False,
                          timestamp=None,
                          eval_model=None,
                          meta=None):
    """Build loaders, wrap the model(s), build a runner and run training.

    The function performs, in order: data-loader construction, (distributed)
    data-parallel wrapping of ``model`` and the optional ``eval_model``,
    optimizer and runner construction, hook registration (training, sampler
    seed, evaluation, user-defined), checkpoint resume/load, and finally
    ``runner.run``.

    Args:
        model: Model to train. It is moved to GPU through ``.to(...)`` or
            ``.cuda(...)`` (presumably a ``torch.nn.Module`` - confirm).
        dataset: A dataset or a list/tuple of datasets; a single dataset is
            wrapped into a one-element list.
        cfg: Config object read by attribute access, ``in`` and ``.get``
            (presumably an mmcv ``Config`` - confirm). NOTE: it is mutated
            in place (``cfg.data.samples_per_gpu``, ``cfg.runner``,
            ``cfg.data.val`` and the ``evaluation`` dict).
        distributed (bool): Selects ``MMDistributedDataParallel`` over
            ``MMDataParallel`` and the distributed eval hook.
        validate (bool): When true, a validation loader and an eval hook are
            registered on the runner.
        timestamp: Stored on ``runner.timestamp``.
        eval_model: Optional second model. When given it is wrapped like
            ``model`` and handed to ``build_runner`` as ``eval_model``.
        meta: Forwarded unchanged to ``build_runner``.
    """

    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        # Legacy key: warn, then let it override "samples_per_gpu".
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiments')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiments')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    # One training loader per dataset.
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed,
            shuffler_sampler=cfg.data.shuffler_sampler,  # e.g. dict(type='DistributedGroupSampler')
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,  # e.g. dict(type='DistributedSampler')
        ) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        # The model is converted to channels_last memory format before
        # being wrapped.
        model = MMDistributedDataParallel(
            model.to(device='cuda', memory_format=torch.channels_last),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
        if eval_model is not None:
            eval_model = MMDistributedDataParallel(
                eval_model.to(device='cuda', memory_format=torch.channels_last),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
        if eval_model is not None:
            eval_model = MMDataParallel(
                eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    if 'runner' not in cfg:
        # Fall back to an epoch-based runner driven by `total_epochs`.
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)
    else:
        if 'total_epochs' in cfg:
            # Both settings present: they must agree.
            assert cfg.total_epochs == cfg.runner.max_epochs
    # `eval_model` is only passed to the runner when one was supplied.
    if eval_model is not None:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                eval_model=eval_model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))
    else:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        # Hand the raw config to the runner.
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))

    # NOTE: no profiler hook is registered in this version of the file.
    if distributed:
        if isinstance(runner, EpochBasedRunner):
            runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # 'samples_per_gpu' is popped from the validation config (default 1).
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            # Validation batch size > 1 is rejected outright.
            # NOTE(review): nesting reconstructed from a flattened source;
            # the pipeline replacement below is assumed to sit inside this
            # branch (unreachable unless asserts are disabled) - confirm.
            assert False
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True))

        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            shuffler_sampler=cfg.data.shuffler_sampler,  # e.g. dict(type='DistributedGroupSampler')
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,  # e.g. dict(type='DistributedSampler')
        )
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        # Result-file prefix: 'val' / work_dir / current time with spaces
        # and colons replaced by underscores.
        eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ', '_').replace(':', '_'))
        eval_hook = CustomDistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            # Copy so that popping 'priority' does not alter the config.
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    # `resume_from` takes precedence over `load_from`.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/apis/test.py
0 → 100644
View file @
007f2e68
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import
os.path
as
osp
import
pickle
import
shutil
import
tempfile
import
time
import
mmcv
import
torch
import
torch.distributed
as
dist
from
mmcv.image
import
tensor2imgs
from
mmcv.runner
import
get_dist_info
from
mmdet.core
import
encode_mask_results
import
mmcv
import
numpy
as
np
import
pycocotools.mask
as
mask_util
def custom_encode_mask_results(mask_results):
    """Encode per-class bitmap masks to RLE code (semantic masks only).

    Args:
        mask_results (Sequence): Bitmap masks, one item per class. Each item
            is indexed as ``item[:, :, np.newaxis]``, so it must support 2-D
            slicing (presumably an ``(H, W)`` array - confirm with caller).

    Returns:
        list: A one-element list whose single item is the list of RLE
        encodings, in the same order as ``mask_results``. An empty input
        yields ``[[]]``.
    """
    encoded_mask_results = [
        # pycocotools expects a Fortran-ordered uint8 array with a trailing
        # axis; `encode` returns one RLE per slice along that axis, hence [0].
        mask_util.encode(
            np.array(cls_segm[:, :, np.newaxis], order='F',
                     dtype='uint8'))[0]
        for cls_segm in mask_results
    ]
    return [encoded_mask_results]
def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False,
                          profiler_log_dir='/workspace/BEVFormer/profiler_logs/'):
    """Test model with multiple gpus.

    This method tests model with multiple gpus and collects the results
    under two different modes: gpu and cpu modes. By setting 'gpu_collect=True'
    it encodes results to gpu tensors and use gpu communication for results
    collection. On cpu mode it saves the results on different gpus to 'tmpdir'
    and collects them by the rank 0 worker.

    The inference loop always runs under ``torch.profiler`` and, on rank 0,
    prints the CUDA-event latency of one iteration every 20 iterations.

    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        tmpdir (str): Path of directory to save the temporary results from
            different gpus under cpu mode.
        gpu_collect (bool): Option to use either gpu or cpu to collect results.
        profiler_log_dir (str): Directory handed to
            ``torch.profiler.tensorboard_trace_handler``. Defaults to the
            path that was previously hard-coded.

    Returns:
        list | dict: The bbox results alone when no mask results were
        produced, otherwise a dict with keys ``'bbox_results'`` and
        ``'mask_results'``.
    """
    # Local import: also guarantees the `torch.profiler` submodule is loaded.
    from torch.profiler import ProfilerActivity, record_function

    model = model.to(memory_format=torch.channels_last)
    model.eval()
    bbox_results = []
    mask_results = []
    dataset = data_loader.dataset
    rank, world_size = get_dist_info()
    if rank == 0:
        prog_bar = mmcv.ProgressBar(len(dataset))
    time.sleep(2)  # This line can prevent deadlock problem in some cases.
    have_mask = False

    # 1. Set up the profiler: skip 10 steps, warm up for 10, record 2, once.
    prof = torch.profiler.profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        schedule=torch.profiler.schedule(
            wait=10, warmup=10, active=2, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler(
            profiler_log_dir),
        record_shapes=True,
        with_stack=True)

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    prof.start()
    try:
        for i, data in enumerate(data_loader):
            # Reset per iteration so a result without 'bbox_results' can
            # neither hit an unbound name nor reuse a stale value.
            batch_size = 0

            # 2. Record the GPU start time (asynchronous, non-blocking).
            start_event.record()

            with torch.no_grad():
                # Label this region as inference in the profiler trace.
                with record_function("model_inference"):
                    result = model(return_loss=False, rescale=True, **data)

                if isinstance(result, dict):
                    if 'bbox_results' in result.keys():
                        bbox_result = result['bbox_results']
                        batch_size = len(result['bbox_results'])
                        bbox_results.extend(bbox_result)
                    if 'mask_results' in result.keys() and result['mask_results'] is not None:
                        mask_result = custom_encode_mask_results(result['mask_results'])
                        mask_results.extend(mask_result)
                        have_mask = True
                else:
                    batch_size = len(result)
                    bbox_results.extend(result)

            # Record the GPU end time (asynchronous, non-blocking).
            end_event.record()

            # Advance the profiler schedule by one step.
            prof.step()

            # 3. Report only every 20 iterations so the synchronisation
            # needed to read the events stays cheap. The value printed is
            # the latency of the most recent iteration only.
            if rank == 0 and i % 20 == 0 and i > 0:
                torch.cuda.synchronize()
                iter_time = start_event.elapsed_time(end_event) / 1000.0
                print(f"[Iter {i}] Latency: {iter_time:.4f}s | FPS: {1/iter_time:.2f}")

            if rank == 0:
                for _ in range(batch_size * world_size):
                    prog_bar.update()
    finally:
        # Always stop the profiler, even when inference raises.
        prof.stop()

    # collect results from all ranks
    if gpu_collect:
        bbox_results = collect_results_gpu(bbox_results, len(dataset))
        if have_mask:
            mask_results = collect_results_gpu(mask_results, len(dataset))
        else:
            mask_results = None
    else:
        bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir)
        # Mask parts go to a sibling directory so they do not overwrite the
        # bbox part files.
        tmpdir = tmpdir + '_mask' if tmpdir is not None else None
        if have_mask:
            mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir)
        else:
            mask_results = None

    if mask_results is None:
        return bbox_results
    return {'bbox_results': bbox_results, 'mask_results': mask_results}
def collect_results_cpu(result_part, size, tmpdir=None):
    """Gather per-rank results through pickle files in a shared directory.

    Every rank dumps ``result_part`` to ``part_<rank>.pkl`` inside ``tmpdir``;
    after a barrier, rank 0 loads all parts, concatenates them in rank order,
    truncates to ``size`` and removes the directory.

    Args:
        result_part: This rank's results; iterated with ``list(...)`` when
            merged.
        size (int): Number of results to keep after concatenation.
        tmpdir (str | None): Directory for the part files. When ``None``,
            rank 0 creates one under ``.dist_test`` and broadcasts its path.

    Returns:
        list | None: The merged results on rank 0, ``None`` on other ranks.
    """
    rank, world_size = get_dist_info()

    if tmpdir is not None:
        mmcv.mkdir_or_exist(tmpdir)
    else:
        # No directory given: rank 0 creates one and shares its name through
        # a fixed-size uint8 tensor padded with spaces (byte 32).
        max_len = 512
        dir_tensor = torch.full((max_len, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            mmcv.mkdir_or_exist('.dist_test')
            created_dir = tempfile.mkdtemp(dir='.dist_test')
            encoded_dir = torch.tensor(
                bytearray(created_dir.encode()),
                dtype=torch.uint8,
                device='cuda')
            dir_tensor[:len(encoded_dir)] = encoded_dir
        dist.broadcast(dir_tensor, 0)
        # Strip the space padding to recover the path.
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()

    # Write this rank's part, then wait until every rank has written.
    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()

    # Only rank 0 assembles the final list.
    if rank != 0:
        return None

    # Because the evaluation sampler was changed so that each GPU handles a
    # continuous run of samples, parts are concatenated rank after rank
    # rather than interleaved with zip(*parts).
    ordered_results = []
    for part_rank in range(world_size):
        part = mmcv.load(osp.join(tmpdir, f'part_{part_rank}.pkl'))
        ordered_results.extend(list(part))

    # the dataloader may pad some samples
    ordered_results = ordered_results[:size]
    # remove tmp dir
    shutil.rmtree(tmpdir)
    return ordered_results
def collect_results_gpu(result_part, size):
    """Collect results from all ranks (delegates to the CPU collector).

    No GPU-based gathering is implemented here; the call is forwarded to
    ``collect_results_cpu`` with an automatically created temporary directory.

    Args:
        result_part: This rank's results.
        size (int): Number of results to keep after merging.

    Returns:
        list | None: Whatever ``collect_results_cpu`` returns - the merged
        results on rank 0 and ``None`` on other ranks.
    """
    # The result must be returned; without it every caller received None.
    return collect_results_cpu(result_part, size)
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/apis/train.py
0 → 100644
View file @
007f2e68
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from
.mmdet_train
import
custom_train_detector
from
mmseg.apis
import
train_segmentor
from
mmdet.apis
import
train_detector
def custom_train_model(model,
                       dataset,
                       cfg,
                       distributed=False,
                       validate=False,
                       timestamp=None,
                       eval_model=None,
                       meta=None):
    """A function wrapper for launching model training according to cfg.

    Because we need different eval_hook in runner. Should be deprecated in the
    future.

    Args:
        model: Model to train, forwarded to ``custom_train_detector``.
        dataset: Dataset(s), forwarded unchanged.
        cfg: Config object; ``cfg.model.type`` selects the training path.
        distributed (bool): Forwarded unchanged.
        validate (bool): Forwarded unchanged.
        timestamp: Forwarded unchanged.
        eval_model: Optional evaluation model, forwarded unchanged.
        meta: Forwarded unchanged.

    Raises:
        AssertionError: If ``cfg.model.type`` is ``'EncoderDecoder3D'``,
            which this wrapper does not support.
    """
    if cfg.model.type in ['EncoderDecoder3D']:
        # Raised explicitly (instead of `assert False`) so the guard is not
        # stripped under `python -O`; the exception type is unchanged.
        raise AssertionError(
            f'custom_train_model does not support model type '
            f'{cfg.model.type!r}')
    custom_train_detector(
        model,
        dataset,
        cfg,
        distributed=distributed,
        validate=validate,
        timestamp=timestamp,
        eval_model=eval_model,
        meta=meta)
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Dispatch training to the segmentor or detector entry point.

    ``cfg.model.type`` decides the target: ``'EncoderDecoder3D'`` goes to
    ``train_segmentor``, anything else to ``train_detector``. All remaining
    arguments are forwarded unchanged. Exists because a different eval_hook
    is needed in the runner; should be deprecated in the future.
    """
    is_segmentor = cfg.model.type in ['EncoderDecoder3D']
    # Only the selected trainer name is looked up, as in a plain if/else.
    launch = train_segmentor if is_segmentor else train_detector
    launch(
        model,
        dataset,
        cfg,
        distributed=distributed,
        validate=validate,
        timestamp=timestamp,
        meta=meta)
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py
0 → 100644
View file @
007f2e68
from
.bevformer_head
import
BEVFormerHead
,
BEVFormerHead_GroupDETR
from
.bev_head
import
BEVHead
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/dense_heads/bev_head.py
0 → 100644
View file @
007f2e68
import
copy
from
re
import
I
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmcv.cnn
import
Linear
,
bias_init_with_prob
from
mmcv.utils
import
TORCH_VERSION
,
digit_version
from
mmdet.core
import
(
multi_apply
,
multi_apply
,
reduce_mean
)
from
mmdet.models.utils.transformer
import
inverse_sigmoid
from
mmdet.models
import
HEADS
from
mmdet.models.dense_heads
import
DETRHead
from
mmdet3d.core.bbox.coders
import
build_bbox_coder
from
traitlets
import
import_item
from
projects.mmdet3d_plugin.core.bbox.util
import
normalize_bbox
from
mmcv.cnn.bricks.transformer
import
build_positional_encoding
from
mmcv.runner
import
BaseModule
,
force_fp32
from
projects.mmdet3d_plugin.models.utils.bricks
import
run_time
import
numpy
as
np
import
mmcv
import
cv2
as
cv
from
projects.mmdet3d_plugin.bevformer.modules
import
PerceptionTransformerBEVEncoder
from
mmdet.models.utils
import
build_transformer
from
mmdet3d.models.builder
import
build_head
from
mmdet3d.models.dense_heads.free_anchor3d_head
import
FreeAnchor3DHead
@HEADS.register_module()
class BEVHead(BaseModule):
    """Head that builds BEV features and feeds them to a 3D bbox head.

    A learnable BEV query embedding of ``bev_h * bev_w`` entries is passed
    through ``transformer`` together with the multi-view image features; the
    result is reshaped to a ``(bs, embed_dims, bev_h, bev_w)`` map and handed
    to the head built from ``pts_bbox_head_3d``.

    Args:
        bev_h (int): Number of BEV grid rows.
        bev_w (int): Number of BEV grid columns.
        pc_range (Sequence[float]): Range whose indices 0/3 and 1/4 give the
            extents used for ``real_w`` and ``real_h``.
        embed_dims (int): Channel size of the BEV embedding.
        transformer (dict): Config passed to ``build_transformer``.
        positional_encoding (dict): Config passed to
            ``build_positional_encoding``.
        pts_bbox_head_3d (dict): Config passed to ``build_head``. NOTE: it is
            updated in place with ``**kwargs`` before being built.
        init_cfg (dict, optional): Forwarded to ``BaseModule``.
        **kwargs: Extra entries merged into ``pts_bbox_head_3d``.
    """

    def __init__(self,
                 bev_h,
                 bev_w,
                 pc_range,
                 embed_dims,
                 transformer,
                 positional_encoding: dict,
                 pts_bbox_head_3d: dict,
                 init_cfg=None,
                 **kwargs,
                 ):
        super(BEVHead, self).__init__(init_cfg=init_cfg)
        self.bev_h = bev_h
        self.bev_w = bev_w
        self.embed_dims = embed_dims
        self.pc_range = pc_range
        self.fp16_enabled = False
        self.transformer: PerceptionTransformerBEVEncoder = build_transformer(transformer)
        self.positional_encoding = build_positional_encoding(positional_encoding)

        # Extra keyword arguments are merged into the inner head's config.
        pts_bbox_head_3d.update(kwargs)
        self.pts_bbox_head_3d = build_head(pts_bbox_head_3d)
        self.real_w = self.pc_range[3] - self.pc_range[0]
        self.real_h = self.pc_range[4] - self.pc_range[1]

        self._init_layers()

    def init_weights(self):
        """Initialize weights of the Multi View BEV Encoder"""
        self.transformer.init_weights()

    def _init_layers(self):
        """Create the learnable BEV query embedding."""
        self.bev_embedding = nn.Embedding(self.bev_h * self.bev_w, self.embed_dims)

    # NOTE(review): 'pred_bev' matches no parameter of `forward` (the
    # parameter is `prev_bev`), so only `mlvl_feats` is ever cast. Looks like
    # a typo; left unchanged to preserve fp16 behaviour - confirm.
    @force_fp32(apply_to=('mlvl_feats', 'pred_bev'))
    def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False):
        """Compute BEV features and, unless ``only_bev``, head predictions.

        Args:
            mlvl_feats (Sequence[Tensor]): Multi-level features; the first
                level must be 5-D with batch size first (presumably
                ``(B, N, C, H, W)`` - confirm).
            img_metas: Forwarded to the transformer.
            prev_bev: Previous BEV features, forwarded to the transformer.
            only_bev (bool): Return the transformer output directly.

        Returns:
            Tensor | dict: The BEV embedding when ``only_bev`` is true,
            otherwise a dict with ``'pred'`` (inner head output) and, outside
            training mode, ``'bev_embed'``.
        """
        # The 5-way unpack also enforces that the first level is 5-D.
        bs, _num_cam, _, _, _ = mlvl_feats[0].shape
        dtype = mlvl_feats[0].dtype
        bev_queries = self.bev_embedding.weight.to(dtype)

        bev_mask = torch.zeros((bs, self.bev_h, self.bev_w),
                               device=bev_queries.device).to(dtype)
        bev_pos = self.positional_encoding(bev_mask).to(dtype)

        bev_embed = self.transformer(
            mlvl_feats,
            bev_queries,
            self.bev_h,
            self.bev_w,
            grid_length=(self.real_h / self.bev_h,
                         self.real_w / self.bev_w),
            bev_pos=bev_pos,
            img_metas=img_metas,
            prev_bev=prev_bev,
        )

        if only_bev:
            return bev_embed

        # Move channels forward and unfold the flattened grid.
        # presumably bev_embed is (bs, bev_h * bev_w, embed_dims) - confirm.
        bev_feature = bev_embed.permute(0, 2, 1).reshape(
            bs, self.embed_dims, self.bev_h, self.bev_w)
        ret = {}
        ret['pred'] = self.pts_bbox_head_3d([bev_feature, ])
        if not self.training:
            ret['bev_embed'] = bev_embed
        return ret

    # `apply_to` must be a tuple: a bare ('ret') is just the string 'ret',
    # which turns mmcv's membership test into a substring match.
    @force_fp32(apply_to=('ret',))
    def loss(self,
             gt_bboxes_list,
             gt_labels_list,
             ret,
             gt_bboxes_ignore=None,
             img_metas=None):
        """Delegate loss computation to the inner 3D head on ``ret['pred']``.

        ``gt_bboxes_ignore`` must be ``None``.
        """
        assert gt_bboxes_ignore is None
        return self.pts_bbox_head_3d.loss(
            gt_bboxes_list,
            gt_labels_list,
            ret['pred'],
            gt_bboxes_ignore=gt_bboxes_ignore,
            img_metas=img_metas)

    @force_fp32(apply_to=('ret',))
    def get_bboxes(self, ret, img_metas, rescale=False):
        """Delegate box decoding to the inner 3D head on ``ret['pred']``.

        NOTE(review): ``rescale`` is accepted but not forwarded - confirm
        whether that is intended.
        """
        return self.pts_bbox_head_3d.get_bboxes(ret['pred'], img_metas)
@HEADS.register_module()
class FreeAnchor3DHeadV2(FreeAnchor3DHead):
    """``FreeAnchor3DHead`` variant taking its predictions as one bundle.

    ``loss`` and ``get_bboxes`` receive a single ``pred`` argument that is
    unpacked into ``(cls_scores, bbox_preds, dir_cls_preds)`` and forwarded
    to the parent implementation.
    """

    # `apply_to` must be a tuple: a bare ('pred') is just the string 'pred',
    # which turns mmcv's membership test into a substring match.
    @force_fp32(apply_to=('pred',))
    def loss(self,
             gt_bboxes_list,
             gt_labels_list,
             pred,
             gt_bboxes_ignore=None,
             img_metas=None):
        """Unpack ``pred`` and compute the parent head's loss.

        Args:
            gt_bboxes_list: Ground-truth boxes, forwarded to the parent.
            gt_labels_list: Ground-truth labels, forwarded to the parent.
            pred: Three-item bundle ``(cls_scores, bbox_preds,
                dir_cls_preds)``.
            gt_bboxes_ignore: Forwarded to the parent.
            img_metas: Forwarded to the parent.

        Returns:
            The value returned by ``FreeAnchor3DHead.loss``.
        """
        cls_scores, bbox_preds, dir_cls_preds = pred
        # The parent takes positional arguments in a different order.
        return super().loss(cls_scores, bbox_preds, dir_cls_preds,
                            gt_bboxes_list, gt_labels_list, img_metas,
                            gt_bboxes_ignore)

    @force_fp32(apply_to=('pred',))
    def get_bboxes(self, pred, img_metas, rescale=False):
        """Unpack ``pred`` and decode boxes with the parent head.

        Args:
            pred: Three-item bundle ``(cls_scores, bbox_preds,
                dir_cls_preds)``.
            img_metas: Forwarded to the parent.
            rescale (bool): Forwarded to the parent.

        Returns:
            The value returned by ``FreeAnchor3DHead.get_bboxes`` called with
            ``cfg=None``.
        """
        cls_scores, bbox_preds, dir_cls_preds = pred
        return super().get_bboxes(
            cls_scores,
            bbox_preds,
            dir_cls_preds,
            img_metas,
            cfg=None,
            rescale=rescale)
\ No newline at end of file
docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py
0 → 100644
View file @
007f2e68
import
copy
import
torch
import
torch.nn
as
nn
from
mmcv.cnn
import
Linear
,
bias_init_with_prob
from
mmcv.utils
import
TORCH_VERSION
,
digit_version
from
mmdet.core
import
(
multi_apply
,
multi_apply
,
reduce_mean
)
from
mmdet.models.utils.transformer
import
inverse_sigmoid
from
mmdet.models
import
HEADS
from
mmdet.models.dense_heads
import
DETRHead
from
mmdet3d.core.bbox.coders
import
build_bbox_coder
from
projects.mmdet3d_plugin.core.bbox.util
import
normalize_bbox
from
mmcv.runner
import
force_fp32
,
auto_fp16
@
HEADS
.
register_module
()
class
BEVFormerHead
(
DETRHead
):
"""Head of Detr3D.
Args:
with_box_refine (bool): Whether to refine the reference points
in the decoder. Defaults to False.
as_two_stage (bool) : Whether to generate the proposal from
the outputs of encoder.
transformer (obj:`ConfigDict`): ConfigDict is used for building
the Encoder and Decoder.
bev_h, bev_w (int): spatial shape of BEV queries.
"""
def
__init__
(
self
,
*
args
,
with_box_refine
=
False
,
as_two_stage
=
False
,
transformer
=
None
,
bbox_coder
=
None
,
num_cls_fcs
=
2
,
code_weights
=
None
,
bev_h
=
30
,
bev_w
=
30
,
**
kwargs
):
self
.
bev_h
=
bev_h
self
.
bev_w
=
bev_w
self
.
fp16_enabled
=
False
self
.
with_box_refine
=
with_box_refine
self
.
as_two_stage
=
as_two_stage
if
self
.
as_two_stage
:
transformer
[
'as_two_stage'
]
=
self
.
as_two_stage
if
'code_size'
in
kwargs
:
self
.
code_size
=
kwargs
[
'code_size'
]
else
:
self
.
code_size
=
10
if
code_weights
is
not
None
:
self
.
code_weights
=
code_weights
else
:
self
.
code_weights
=
[
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
0.2
,
0.2
]
self
.
bbox_coder
=
build_bbox_coder
(
bbox_coder
)
self
.
pc_range
=
self
.
bbox_coder
.
pc_range
self
.
real_w
=
self
.
pc_range
[
3
]
-
self
.
pc_range
[
0
]
self
.
real_h
=
self
.
pc_range
[
4
]
-
self
.
pc_range
[
1
]
self
.
num_cls_fcs
=
num_cls_fcs
-
1
super
(
BEVFormerHead
,
self
).
__init__
(
*
args
,
transformer
=
transformer
,
**
kwargs
)
self
.
code_weights
=
nn
.
Parameter
(
torch
.
tensor
(
self
.
code_weights
,
requires_grad
=
False
),
requires_grad
=
False
)
def
_init_layers
(
self
):
"""Initialize classification branch and regression branch of head."""
cls_branch
=
[]
for
_
in
range
(
self
.
num_reg_fcs
):
cls_branch
.
append
(
Linear
(
self
.
embed_dims
,
self
.
embed_dims
))
cls_branch
.
append
(
nn
.
LayerNorm
(
self
.
embed_dims
))
cls_branch
.
append
(
nn
.
ReLU
(
inplace
=
True
))
cls_branch
.
append
(
Linear
(
self
.
embed_dims
,
self
.
cls_out_channels
))
fc_cls
=
nn
.
Sequential
(
*
cls_branch
)
reg_branch
=
[]
for
_
in
range
(
self
.
num_reg_fcs
):
reg_branch
.
append
(
Linear
(
self
.
embed_dims
,
self
.
embed_dims
))
reg_branch
.
append
(
nn
.
ReLU
())
reg_branch
.
append
(
Linear
(
self
.
embed_dims
,
self
.
code_size
))
reg_branch
=
nn
.
Sequential
(
*
reg_branch
)
def
_get_clones
(
module
,
N
):
return
nn
.
ModuleList
([
copy
.
deepcopy
(
module
)
for
i
in
range
(
N
)])
# last reg_branch is used to generate proposal from
# encode feature map when as_two_stage is True.
num_pred
=
(
self
.
transformer
.
decoder
.
num_layers
+
1
)
if
\
self
.
as_two_stage
else
self
.
transformer
.
decoder
.
num_layers
if
self
.
with_box_refine
:
self
.
cls_branches
=
_get_clones
(
fc_cls
,
num_pred
)
self
.
reg_branches
=
_get_clones
(
reg_branch
,
num_pred
)
else
:
self
.
cls_branches
=
nn
.
ModuleList
(
[
fc_cls
for
_
in
range
(
num_pred
)])
self
.
reg_branches
=
nn
.
ModuleList
(
[
reg_branch
for
_
in
range
(
num_pred
)])
if
not
self
.
as_two_stage
:
self
.
bev_embedding
=
nn
.
Embedding
(
self
.
bev_h
*
self
.
bev_w
,
self
.
embed_dims
)
self
.
query_embedding
=
nn
.
Embedding
(
self
.
num_query
,
self
.
embed_dims
*
2
)
def
init_weights
(
self
):
"""Initialize weights of the DeformDETR head."""
self
.
transformer
.
init_weights
()
if
self
.
loss_cls
.
use_sigmoid
:
bias_init
=
bias_init_with_prob
(
0.01
)
for
m
in
self
.
cls_branches
:
nn
.
init
.
constant_
(
m
[
-
1
].
bias
,
bias_init
)
@auto_fp16(apply_to=('mlvl_feats', ))
def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False):
    """Forward function.

    Args:
        mlvl_feats (tuple[Tensor]): Features from the upstream
            network, each is a 5D-tensor with shape
            (B, N, C, H, W).
        img_metas: Meta information, forwarded untouched to the
            transformer.
        prev_bev: Previous BEV features, forwarded untouched to the
            transformer. Default None.
        only_bev (bool): Only compute BEV features with the encoder.

    Returns:
        When ``only_bev`` is True, whatever
        ``self.transformer.get_bev_features`` returns. Otherwise a dict
        with:

        - ``bev_embed``: BEV embedding returned by the transformer.
        - ``all_cls_scores`` (Tensor): Stacked outputs of the
          classification branches, one slice per decoder layer.
        - ``all_bbox_preds`` (Tensor): Stacked outputs of the regression
          branches, one slice per decoder layer; the last dimension has
          ``self.code_size`` entries. Channels 0, 1 and 4 are offset by
          the reference point, passed through a sigmoid and rescaled with
          ``self.pc_range``; the remaining channels are raw outputs.
        - ``enc_cls_scores`` / ``enc_bbox_preds``: always None here.
    """
    # Unpacking five values also enforces the 5D layout of the features.
    bs, _, _, _, _ = mlvl_feats[0].shape
    dtype = mlvl_feats[0].dtype
    object_query_embeds = self.query_embedding.weight.to(dtype)
    bev_queries = self.bev_embedding.weight.to(dtype)

    bev_mask = torch.zeros((bs, self.bev_h, self.bev_w),
                           device=bev_queries.device).to(dtype)
    bev_pos = self.positional_encoding(bev_mask).to(dtype)
    grid_length = (self.real_h / self.bev_h, self.real_w / self.bev_w)

    if only_bev:
        # only use encoder to obtain BEV features, TODO: refine the workaround
        return self.transformer.get_bev_features(
            mlvl_feats,
            bev_queries,
            self.bev_h,
            self.bev_w,
            grid_length=grid_length,
            bev_pos=bev_pos,
            img_metas=img_metas,
            prev_bev=prev_bev,
        )

    outputs = self.transformer(
        mlvl_feats,
        bev_queries,
        object_query_embeds,
        self.bev_h,
        self.bev_w,
        grid_length=grid_length,
        bev_pos=bev_pos,
        reg_branches=self.reg_branches if self.with_box_refine else None,
        cls_branches=self.cls_branches if self.as_two_stage else None,
        img_metas=img_metas,
        prev_bev=prev_bev,
    )

    bev_embed, hs, init_reference, inter_references = outputs
    # Swap dims 1 and 2 of the decoder output.
    # NOTE(review): presumably (layers, query, bs, dim) ->
    # (layers, bs, query, dim) -- confirm against the transformer.
    hs = hs.permute(0, 2, 1, 3)

    # NOTE(review): pc_range is assumed to be laid out as
    # [x_min, y_min, z_min, x_max, y_max, z_max] -- confirm in the config.
    pc_range = self.pc_range
    outputs_classes = []
    outputs_coords = []
    for lvl in range(hs.shape[0]):
        # Layer 0 refines the initial reference points, later layers the
        # reference points produced by the previous layer.
        reference = init_reference if lvl == 0 \
            else inter_references[lvl - 1]
        reference = inverse_sigmoid(reference)
        outputs_class = self.cls_branches[lvl](hs[lvl])
        tmp = self.reg_branches[lvl](hs[lvl])

        # TODO: check the shape of reference
        assert reference.shape[-1] == 3
        # Channels 0:2 take reference channels 0:2, channel 4 takes
        # reference channel 2; each sum is squashed to (0, 1).
        tmp[..., 0:2] += reference[..., 0:2]
        tmp[..., 0:2] = tmp[..., 0:2].sigmoid()
        tmp[..., 4:5] += reference[..., 2:3]
        tmp[..., 4:5] = tmp[..., 4:5].sigmoid()
        # Rescale the normalized values into the pc_range extents.
        tmp[..., 0:1] = (tmp[..., 0:1] * (pc_range[3] - pc_range[0]) +
                         pc_range[0])
        tmp[..., 1:2] = (tmp[..., 1:2] * (pc_range[4] - pc_range[1]) +
                         pc_range[1])
        tmp[..., 4:5] = (tmp[..., 4:5] * (pc_range[5] - pc_range[2]) +
                         pc_range[2])

        # TODO: check if using sigmoid
        outputs_classes.append(outputs_class)
        outputs_coords.append(tmp)

    return {
        'bev_embed': bev_embed,
        'all_cls_scores': torch.stack(outputs_classes),
        'all_bbox_preds': torch.stack(outputs_coords),
        'enc_cls_scores': None,
        'enc_bbox_preds': None,
    }
def _get_target_single(self,
                       cls_score,
                       bbox_pred,
                       gt_labels,
                       gt_bboxes,
                       gt_bboxes_ignore=None):
    """Compute regression and classification targets for one sample.

    Outputs from a single decoder layer are used.

    Args:
        cls_score (Tensor): Box score logits from a single decoder layer
            for one sample.
        bbox_pred (Tensor): Box predictions from a single decoder layer
            for one sample; the first dimension indexes the queries.
        gt_labels (Tensor): Ground truth class indices for one sample.
        gt_bboxes (Tensor): Ground truth boxes for one sample; its last
            dimension fixes the width of the returned box targets.
        gt_bboxes_ignore (Tensor, optional): Boxes which can be ignored,
            forwarded to the assigner. Default None.

    Returns:
        tuple[Tensor]: a tuple containing the following for one sample.

        - labels (Tensor): Per-query labels; unmatched queries hold
          ``self.num_classes``.
        - label_weights (Tensor): Per-query label weights (all ones).
        - bbox_targets (Tensor): Per-query box targets, zero for
          unmatched queries.
        - bbox_weights (Tensor): Per-query box weights, one for matched
          queries and zero otherwise.
        - pos_inds (Tensor): Indices of the positive queries.
        - neg_inds (Tensor): Indices of the negative queries.
    """
    num_queries = bbox_pred.size(0)
    gt_dims = gt_bboxes.shape[-1]

    # assigner and sampler
    assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
                                         gt_labels, gt_bboxes_ignore)
    sampling_result = self.sampler.sample(assign_result, bbox_pred,
                                          gt_bboxes)
    pos_inds, neg_inds = sampling_result.pos_inds, sampling_result.neg_inds

    # label targets: start from the "no object" index, then overwrite the
    # matched queries with the label of their assigned ground truth.
    labels = gt_bboxes.new_full((num_queries, ),
                                self.num_classes,
                                dtype=torch.long)
    labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
    label_weights = gt_bboxes.new_ones(num_queries)

    # bbox targets: weights keep the prediction width, targets are cut to
    # the ground-truth width.
    bbox_weights = torch.zeros_like(bbox_pred)
    bbox_weights[pos_inds] = 1.0
    bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_dims]

    # DETR
    bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
    return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
            neg_inds)
def get_targets(self,
                cls_scores_list,
                bbox_preds_list,
                gt_bboxes_list,
                gt_labels_list,
                gt_bboxes_ignore_list=None):
    """Compute regression and classification targets for a batch.

    Outputs from a single decoder layer are used. Each sample is handled
    by ``_get_target_single``.

    Args:
        cls_scores_list (list[Tensor]): Box score logits from a single
            decoder layer, one tensor per sample.
        bbox_preds_list (list[Tensor]): Box predictions from a single
            decoder layer, one tensor per sample.
        gt_bboxes_list (list[Tensor]): Ground truth boxes, one tensor per
            sample.
        gt_labels_list (list[Tensor]): Ground truth class indices, one
            tensor per sample.
        gt_bboxes_ignore_list (None): Must be None; ignored boxes are not
            supported.

    Returns:
        tuple: a tuple containing the following targets.

        - labels_list (list[Tensor]): Labels for all samples.
        - label_weights_list (list[Tensor]): Label weights for all
          samples.
        - bbox_targets_list (list[Tensor]): BBox targets for all samples.
        - bbox_weights_list (list[Tensor]): BBox weights for all samples.
        - num_total_pos (int): Number of positive queries in all samples.
        - num_total_neg (int): Number of negative queries in all samples.
    """
    assert gt_bboxes_ignore_list is None, \
        'Only supports for gt_bboxes_ignore setting to None.'
    batch_size = len(cls_scores_list)
    # multi_apply needs one "ignore" entry per sample.
    ignore_per_sample = [gt_bboxes_ignore_list] * batch_size

    per_sample_targets = multi_apply(self._get_target_single,
                                     cls_scores_list, bbox_preds_list,
                                     gt_labels_list, gt_bboxes_list,
                                     ignore_per_sample)
    (labels_list, label_weights_list, bbox_targets_list,
     bbox_weights_list, pos_inds_list, neg_inds_list) = per_sample_targets

    num_total_pos = sum(inds.numel() for inds in pos_inds_list)
    num_total_neg = sum(inds.numel() for inds in neg_inds_list)
    return (labels_list, label_weights_list, bbox_targets_list,
            bbox_weights_list, num_total_pos, num_total_neg)
def loss_single(self,
                cls_scores,
                bbox_preds,
                gt_bboxes_list,
                gt_labels_list,
                gt_bboxes_ignore_list=None):
    """Loss function for the outputs of a single decoder layer.

    Args:
        cls_scores (Tensor): Box score logits from a single decoder layer
            for all samples; the first dimension is the batch.
        bbox_preds (Tensor): Box predictions from a single decoder layer
            for all samples; the first dimension is the batch.
        gt_bboxes_list (list[Tensor]): Ground truth boxes, one tensor per
            sample.
        gt_labels_list (list[Tensor]): Ground truth class indices, one
            tensor per sample.
        gt_bboxes_ignore_list (list[Tensor], optional): Boxes which can be
            ignored for each sample. Default None.

    Returns:
        tuple[Tensor, Tensor]: The classification loss and the box
        regression loss of this decoder layer.
    """
    batch_size = cls_scores.size(0)
    per_sample_scores = [cls_scores[i] for i in range(batch_size)]
    per_sample_boxes = [bbox_preds[i] for i in range(batch_size)]

    (labels_list, label_weights_list, bbox_targets_list,
     bbox_weights_list, num_total_pos, num_total_neg) = self.get_targets(
         per_sample_scores, per_sample_boxes, gt_bboxes_list,
         gt_labels_list, gt_bboxes_ignore_list)

    # Flatten the per-sample targets into batch-wide tensors.
    labels = torch.cat(labels_list, 0)
    label_weights = torch.cat(label_weights_list, 0)
    bbox_targets = torch.cat(bbox_targets_list, 0)
    bbox_weights = torch.cat(bbox_weights_list, 0)

    # classification loss
    cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
    # construct weighted avg_factor to match with the official DETR repo
    cls_avg_factor = \
        num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight
    if self.sync_cls_avg_factor:
        cls_avg_factor = reduce_mean(
            cls_scores.new_tensor([cls_avg_factor]))
    cls_avg_factor = max(cls_avg_factor, 1)
    loss_cls = self.loss_cls(cls_scores,
                             labels,
                             label_weights,
                             avg_factor=cls_avg_factor)

    # Compute the average number of gt boxes across all gpus, for
    # normalization purposes
    num_total_pos = loss_cls.new_tensor([num_total_pos])
    num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

    # regression L1 loss
    bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1))
    normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range)
    # Keep only rows whose normalized target is finite in every channel.
    finite_rows = torch.isfinite(normalized_bbox_targets).all(dim=-1)
    bbox_weights = bbox_weights * self.code_weights

    # Only the first 10 channels take part in the regression loss.
    loss_bbox = self.loss_bbox(bbox_preds[finite_rows, :10],
                               normalized_bbox_targets[finite_rows, :10],
                               bbox_weights[finite_rows, :10],
                               avg_factor=num_total_pos)

    # torch.nan_to_num is only called on torch >= 1.8.
    if digit_version(TORCH_VERSION) >= digit_version('1.8'):
        loss_cls = torch.nan_to_num(loss_cls)
        loss_bbox = torch.nan_to_num(loss_bbox)
    return loss_cls, loss_bbox
@force_fp32(apply_to=('preds_dicts', ))
def loss(self,
         gt_bboxes_list,
         gt_labels_list,
         preds_dicts,
         gt_bboxes_ignore=None,
         img_metas=None):
    """Loss function.

    Args:
        gt_bboxes_list (list): Ground truth boxes, one box container per
            sample. Each entry must expose ``gravity_center`` and
            ``tensor``; they are concatenated into
            ``(gravity_center, tensor[:, 3:])`` before matching.
        gt_labels_list (list[Tensor]): Ground truth class indices, one
            tensor per sample.
        preds_dicts (dict): Head outputs with the keys
            ``all_cls_scores`` and ``all_bbox_preds`` (first dimension is
            the decoder layer) and ``enc_cls_scores`` /
            ``enc_bbox_preds`` (encoder proposals, or None).
        gt_bboxes_ignore (None): Must be None; ignored boxes are not
            supported.
        img_metas: Unused.

    Returns:
        dict[str, Tensor]: A dictionary of loss components: ``loss_cls``
        and ``loss_bbox`` for the last decoder layer,
        ``d{i}.loss_cls`` / ``d{i}.loss_bbox`` for the earlier layers and,
        when encoder proposals are present, ``enc_loss_cls`` /
        ``enc_loss_bbox``.
    """
    assert gt_bboxes_ignore is None, \
        f'{self.__class__.__name__} only supports ' \
        f'for gt_bboxes_ignore setting to None.'

    all_cls_scores = preds_dicts['all_cls_scores']
    all_bbox_preds = preds_dicts['all_bbox_preds']
    enc_cls_scores = preds_dicts['enc_cls_scores']
    enc_bbox_preds = preds_dicts['enc_bbox_preds']

    num_dec_layers = len(all_cls_scores)
    device = gt_labels_list[0].device

    # Replace the first three box columns by the gravity center and move
    # the result to the device of the labels.
    gt_bboxes_list = [
        torch.cat((gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]),
                  dim=1).to(device) for gt_bboxes in gt_bboxes_list
    ]

    # Every decoder layer is supervised with the same ground truth.
    all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
    all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
    all_gt_bboxes_ignore_list = [
        gt_bboxes_ignore for _ in range(num_dec_layers)
    ]

    losses_cls, losses_bbox = multi_apply(
        self.loss_single, all_cls_scores, all_bbox_preds,
        all_gt_bboxes_list, all_gt_labels_list,
        all_gt_bboxes_ignore_list)

    loss_dict = dict()
    # loss of proposal generated from encode feature map.
    if enc_cls_scores is not None:
        # One all-zero label tensor per sample. The list must follow the
        # batch (gt_labels_list), not the number of decoder layers.
        binary_labels_list = [
            torch.zeros_like(gt_labels) for gt_labels in gt_labels_list
        ]
        enc_loss_cls, enc_losses_bbox = \
            self.loss_single(enc_cls_scores, enc_bbox_preds,
                             gt_bboxes_list, binary_labels_list,
                             gt_bboxes_ignore)
        loss_dict['enc_loss_cls'] = enc_loss_cls
        loss_dict['enc_loss_bbox'] = enc_losses_bbox

    # loss from the last decoder layer
    loss_dict['loss_cls'] = losses_cls[-1]
    loss_dict['loss_bbox'] = losses_bbox[-1]

    # loss from other decoder layers
    for layer_idx, (loss_cls_i, loss_bbox_i) in enumerate(
            zip(losses_cls[:-1], losses_bbox[:-1])):
        loss_dict[f'd{layer_idx}.loss_cls'] = loss_cls_i
        loss_dict[f'd{layer_idx}.loss_bbox'] = loss_bbox_i
    return loss_dict
@force_fp32(apply_to=('preds_dicts', ))
def get_bboxes(self, preds_dicts, img_metas, rescale=False):
    """Generate bboxes from bbox head predictions.

    Args:
        preds_dicts: Prediction results, decoded with
            ``self.bbox_coder.decode``.
        img_metas (list[dict]): Meta info of each sample; the entry
            ``'box_type_3d'`` is called to wrap the decoded boxes.
        rescale (bool): Unused; kept for interface compatibility.

    Returns:
        list[list]: One ``[bboxes, scores, labels]`` triple per sample.
    """
    preds_dicts = self.bbox_coder.decode(preds_dicts)

    ret_list = []
    for i, preds in enumerate(preds_dicts):
        bboxes = preds['bboxes']
        # Lower column 2 by half of column 5 (in place).
        # NOTE(review): presumably converts a gravity-center z into a
        # bottom-center z, with column 5 being the box height -- confirm
        # against the bbox coder.
        bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5

        code_size = bboxes.shape[-1]
        bboxes = img_metas[i]['box_type_3d'](bboxes, code_size)
        ret_list.append([bboxes, preds['scores'], preds['labels']])
    return ret_list
@HEADS.register_module()
class BEVFormerHead_GroupDETR(BEVFormerHead):
    """BEVFormer head that trains with several groups of object queries.

    The query budget passed as ``num_query`` is multiplied by
    ``group_detr``. During training all queries are decoded and the loss
    is averaged over the groups; outside training only the first group of
    queries is used.
    """

    def __init__(self, *args, group_detr=1, **kwargs):
        """Scale ``num_query`` by ``group_detr`` and build the base head.

        Args:
            *args: Positional arguments for ``BEVFormerHead``.
            group_detr (int): Number of query groups. Default 1.
            **kwargs: Keyword arguments for ``BEVFormerHead``; must
                contain ``num_query`` (queries per group).
        """
        self.group_detr = group_detr
        assert 'num_query' in kwargs
        kwargs['num_query'] = group_detr * kwargs['num_query']
        super().__init__(*args, **kwargs)

    def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False):
        """Forward function.

        Args:
            mlvl_feats (tuple[Tensor]): Features from the upstream
                network; the first entry must be 5-dimensional.
            img_metas: Meta information, forwarded to the transformer.
            prev_bev: Previous BEV features, forwarded to the transformer.
            only_bev (bool): Only compute BEV features with the encoder.

        Returns:
            When ``only_bev`` is True, the result of
            ``self.transformer.get_bev_features``. Otherwise a dict with
            ``bev_embed``, ``all_cls_scores``, ``all_bbox_preds`` and the
            always-None ``enc_cls_scores`` / ``enc_bbox_preds``.
        """
        first_level = mlvl_feats[0]
        bs, num_cam, _, _, _ = first_level.shape
        dtype = first_level.dtype

        object_query_embeds = self.query_embedding.weight.to(dtype)
        if not self.training:  # NOTE: Only difference to bevformer head
            # Outside training keep only the first group of queries.
            queries_per_group = self.num_query // self.group_detr
            object_query_embeds = object_query_embeds[:queries_per_group]

        bev_queries = self.bev_embedding.weight.to(dtype)
        bev_mask = torch.zeros((bs, self.bev_h, self.bev_w),
                               device=bev_queries.device).to(dtype)
        bev_pos = self.positional_encoding(bev_mask).to(dtype)

        if only_bev:
            return self.transformer.get_bev_features(
                mlvl_feats,
                bev_queries,
                self.bev_h,
                self.bev_w,
                grid_length=(self.real_h / self.bev_h,
                             self.real_w / self.bev_w),
                bev_pos=bev_pos,
                img_metas=img_metas,
                prev_bev=prev_bev,
            )

        refine_branches = self.reg_branches if self.with_box_refine \
            else None
        proposal_branches = self.cls_branches if self.as_two_stage \
            else None
        outputs = self.transformer(
            mlvl_feats,
            bev_queries,
            object_query_embeds,
            self.bev_h,
            self.bev_w,
            grid_length=(self.real_h / self.bev_h,
                         self.real_w / self.bev_w),
            bev_pos=bev_pos,
            reg_branches=refine_branches,
            cls_branches=proposal_branches,
            img_metas=img_metas,
            prev_bev=prev_bev)

        bev_embed, hs, init_reference, inter_references = outputs
        hs = hs.permute(0, 2, 1, 3)

        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            # Layer 0 uses the initial reference points, later layers the
            # ones produced by the previous layer.
            if lvl > 0:
                reference = inter_references[lvl - 1]
            else:
                reference = init_reference
            reference = inverse_sigmoid(reference)

            layer_feats = hs[lvl]
            outputs_class = self.cls_branches[lvl](layer_feats)
            tmp = self.reg_branches[lvl](layer_feats)

            assert reference.shape[-1] == 3
            # Offset channels 0:2 and 4 by the reference point and squash
            # them to (0, 1).
            tmp[..., 0:2] += reference[..., 0:2]
            tmp[..., 0:2] = tmp[..., 0:2].sigmoid()
            tmp[..., 4:5] += reference[..., 2:3]
            tmp[..., 4:5] = tmp[..., 4:5].sigmoid()

            # Rescale the normalized channels with pc_range.
            tmp[..., 0:1] = (tmp[..., 0:1] *
                             (self.pc_range[3] - self.pc_range[0]) +
                             self.pc_range[0])
            tmp[..., 1:2] = (tmp[..., 1:2] *
                             (self.pc_range[4] - self.pc_range[1]) +
                             self.pc_range[1])
            tmp[..., 4:5] = (tmp[..., 4:5] *
                             (self.pc_range[5] - self.pc_range[2]) +
                             self.pc_range[2])

            outputs_classes.append(outputs_class)
            outputs_coords.append(tmp)

        outputs_classes = torch.stack(outputs_classes)
        outputs_coords = torch.stack(outputs_coords)

        return {
            'bev_embed': bev_embed,
            'all_cls_scores': outputs_classes,
            'all_bbox_preds': outputs_coords,
            'enc_cls_scores': None,
            'enc_bbox_preds': None,
        }

    def loss(self,
             gt_bboxes_list,
             gt_labels_list,
             preds_dicts,
             gt_bboxes_ignore=None,
             img_metas=None):
        """Loss function, averaged over the query groups.

        The queries are split into ``group_detr`` equally sized groups;
        every group is matched against the ground truth on its own and
        contributes ``1 / group_detr`` of its loss.

        Args:
            gt_bboxes_list (list): Ground truth boxes, one box container
                per sample, exposing ``gravity_center`` and ``tensor``.
            gt_labels_list (list[Tensor]): Ground truth class indices, one
                tensor per sample.
            preds_dicts (dict): Head outputs with ``all_cls_scores`` and
                ``all_bbox_preds`` (decoder layer, batch, query, channel)
                and ``enc_cls_scores`` / ``enc_bbox_preds``, which must
                both be None.
            gt_bboxes_ignore (None): Must be None; ignored boxes are not
                supported.
            img_metas: Unused.

        Returns:
            dict[str, Tensor]: ``loss_cls`` / ``loss_bbox`` for the last
            decoder layer and ``d{i}.loss_cls`` / ``d{i}.loss_bbox`` for
            the earlier layers.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'for gt_bboxes_ignore setting to None.'

        all_cls_scores = preds_dicts['all_cls_scores']
        all_bbox_preds = preds_dicts['all_bbox_preds']
        enc_cls_scores = preds_dicts['enc_cls_scores']
        enc_bbox_preds = preds_dicts['enc_bbox_preds']
        assert enc_cls_scores is None and enc_bbox_preds is None

        num_dec_layers = len(all_cls_scores)
        device = gt_labels_list[0].device

        # Replace the first three box columns by the gravity center.
        gt_bboxes_list = [
            torch.cat((boxes.gravity_center, boxes.tensor[:, 3:]),
                      dim=1).to(device) for boxes in gt_bboxes_list
        ]

        # Every decoder layer is supervised with the same ground truth.
        all_gt_bboxes_list = [gt_bboxes_list] * num_dec_layers
        all_gt_labels_list = [gt_labels_list] * num_dec_layers
        all_gt_bboxes_ignore_list = [gt_bboxes_ignore] * num_dec_layers

        # Start every entry at zero so the groups can be accumulated.
        loss_dict = dict()
        loss_dict['loss_cls'] = 0
        loss_dict['loss_bbox'] = 0
        for layer_idx in range(all_cls_scores.shape[0] - 1):
            loss_dict[f'd{layer_idx}.loss_cls'] = 0
            loss_dict[f'd{layer_idx}.loss_bbox'] = 0

        num_query_per_group = self.num_query // self.group_detr
        for group_index in range(self.group_detr):
            start = group_index * num_query_per_group
            stop = start + num_query_per_group
            group_cls_scores = all_cls_scores[:, :, start:stop, :]
            group_bbox_preds = all_bbox_preds[:, :, start:stop, :]

            losses_cls, losses_bbox = multi_apply(
                self.loss_single, group_cls_scores, group_bbox_preds,
                all_gt_bboxes_list, all_gt_labels_list,
                all_gt_bboxes_ignore_list)

            # loss from the last decoder layer
            loss_dict['loss_cls'] += losses_cls[-1] / self.group_detr
            loss_dict['loss_bbox'] += losses_bbox[-1] / self.group_detr

            # loss from other decoder layers
            earlier_layers = zip(losses_cls[:-1], losses_bbox[:-1])
            for layer_idx, (loss_cls_i,
                            loss_bbox_i) in enumerate(earlier_layers):
                loss_dict[f'd{layer_idx}.loss_cls'] += \
                    loss_cls_i / self.group_detr
                loss_dict[f'd{layer_idx}.loss_bbox'] += \
                    loss_bbox_i / self.group_detr
        return loss_dict
\ No newline at end of file
Prev
1
2
3
4
5
6
7
8
…
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment