lishj6 / BEVFomer

Commit 4cd43886
authored Sep 01, 2025 by lishj6
init
parent a9a1fe81
Changes: 207 files in this commit; showing 20 changed files with 4484 additions and 0 deletions (+4484, -0).
projects/configs/bevformer/bevformer_tiny.py  +271  -0
projects/configs/bevformer_fp16/bevformer_tiny_fp16.py  +272  -0
projects/configs/bevformerv2/bevformerv2-r50-t1-24ep.py  +360  -0
projects/configs/bevformerv2/bevformerv2-r50-t1-48ep.py  +360  -0
projects/configs/bevformerv2/bevformerv2-r50-t1-base-24ep.py  +349  -0
projects/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py  +349  -0
projects/configs/bevformerv2/bevformerv2-r50-t2-24ep.py  +360  -0
projects/configs/bevformerv2/bevformerv2-r50-t2-48ep.py  +360  -0
projects/configs/bevformerv2/bevformerv2-r50-t8-24ep.py  +361  -0
projects/configs/datasets/custom_lyft-3d.py  +136  -0
projects/configs/datasets/custom_nus-3d.py  +141  -0
projects/configs/datasets/custom_waymo-3d.py  +112  -0
projects/mmdet3d_plugin/__init__.py  +11  -0
projects/mmdet3d_plugin/bevformer/__init__.py  +6  -0
projects/mmdet3d_plugin/bevformer/apis/__init__.py  +3  -0
projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py  +293  -0
projects/mmdet3d_plugin/bevformer/apis/mmdet_train_nhwc.py  +256  -0
projects/mmdet3d_plugin/bevformer/apis/mmdet_train_profiler.py  +253  -0
projects/mmdet3d_plugin/bevformer/apis/test.py  +164  -0
projects/mmdet3d_plugin/bevformer/apis/train.py  +67  -0
projects/configs/bevformer/bevformer_tiny.py (new file, 0 → 100644)
# BEVFormer-tiny consumes at least 6700M of GPU memory.
# Compared to bevformer_base, bevformer_tiny has:
# a smaller backbone: R101-DCN -> R50
# a smaller BEV: 200*200 -> 50*50
# fewer encoder layers: 6 -> 3
# a smaller input size: 1600*900 -> 800*450
# multi-scale features -> single-scale features (C5)
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py'
]
#
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True)

_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1
bev_h_ = 50
bev_w_ = 50
queue_length = 3  # each sequence contains `queue_length` frames.

model = dict(
    type='BEVFormer',
    use_grid_mask=True,
    video_test_mode=True,
    # pretrained=dict(img='torchvision://resnet50'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch'),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=3,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                im2col_step=64,
                                num_points=8,
                                num_levels=_num_levels_),
                            embed_dims=_dim_,
                        )
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    # model training and testing settings
    train_cfg=dict(pts=dict(
        grid_size=[512, 512, 1],
        voxel_size=voxel_size,
        point_cloud_range=point_cloud_range,
        out_size_factor=4,
        assigner=dict(
            type='HungarianAssigner3D',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
            iou_cost=dict(type='IoUCost', weight=0.0),  # Fake cost. This is just to make it compatible with DETR head.
            pc_range=point_cloud_range))))

dataset_type = 'CustomNuScenesDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True,
         with_attr_label=False),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
]
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(type='DefaultFormatBundle3D', class_names=class_names,
                 with_label=False),
            dict(type='CustomCollect3D', keys=['img'])
        ])
]

data = dict(
    samples_per_gpu=16,
    workers_per_gpu=32,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler')
)

optimizer = dict(
    type='AdamW',
    lr=2e-4,
    paramwise_cfg=dict(custom_keys={
        'img_backbone': dict(lr_mult=0.1),
    }),
    weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)
total_epochs = 1
evaluation = dict(interval=1, pipeline=test_pipeline)

runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
# load_from = 'ckpts/resnet50-19c8e357.pth'
log_config = dict(
    interval=1,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])
checkpoint_config = dict(interval=1)
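As a quick sanity check, a config like the one above can be loaded with mmcv's Config loader, which also resolves the `_base_` inheritance. This is a minimal sketch, assuming an mmcv 1.x installation and the repository root as the working directory:

# Minimal sketch: load the config and inspect a few resolved fields.
# Assumes mmcv < 2.0 and that the _base_ files exist at the listed paths.
from mmcv import Config

cfg = Config.fromfile('projects/configs/bevformer/bevformer_tiny.py')
print(cfg.model.type)                 # 'BEVFormer'
print(cfg.model.pts_bbox_head.bev_h,
      cfg.model.pts_bbox_head.bev_w)  # 50 50
print(cfg.data.samples_per_gpu)       # 16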
projects/configs/bevformer_fp16/bevformer_tiny_fp16.py (new file, 0 → 100644)
# BEVFormer-tiny consumes at least 6700M of GPU memory.
# Compared to bevformer_base, bevformer_tiny has:
# a smaller backbone: R101-DCN -> R50
# a smaller BEV: 200*200 -> 50*50
# fewer encoder layers: 6 -> 3
# a smaller input size: 1600*900 -> 800*450
# multi-scale features -> single-scale features (C5)
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py'
]
#
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True)

_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1
bev_h_ = 50
bev_w_ = 50
queue_length = 3  # each sequence contains `queue_length` frames.

model = dict(
    type='BEVFormer_fp16',
    use_grid_mask=True,
    video_test_mode=True,
    pretrained=dict(img='torchvision://resnet50'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch'),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=3,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=_num_levels_),
                            embed_dims=_dim_,
                        )
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    # model training and testing settings
    train_cfg=dict(pts=dict(
        grid_size=[512, 512, 1],
        voxel_size=voxel_size,
        point_cloud_range=point_cloud_range,
        out_size_factor=4,
        assigner=dict(
            type='HungarianAssigner3D',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
            iou_cost=dict(type='IoUCost', weight=0.0),  # Fake cost. This is just to make it compatible with DETR head.
            pc_range=point_cloud_range))))

dataset_type = 'CustomNuScenesDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True,
         with_attr_label=False),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
]
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(type='DefaultFormatBundle3D', class_names=class_names,
                 with_label=False),
            dict(type='CustomCollect3D', keys=['img'])
        ])
]

data = dict(
    samples_per_gpu=2,
    workers_per_gpu=8,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler')
)

optimizer = dict(
    type='AdamW',
    lr=2.8e-4,
    paramwise_cfg=dict(custom_keys={
        'img_backbone': dict(lr_mult=0.1),
    }),
    weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)
total_epochs = 24
evaluation = dict(interval=1, pipeline=test_pipeline)

runner = dict(type='EpochBasedRunner_video', max_epochs=total_epochs)
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])
fp16 = dict(loss_scale=512.)
checkpoint_config = dict(interval=1)
custom_hooks = [dict(type='TransferWeight', priority='LOWEST')]
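The `fp16 = dict(loss_scale=512.)` line enables mixed-precision training with a static loss scale. A hedged sketch of what static loss scaling amounts to in plain PyTorch (an illustration of the mechanism, not this repo's implementation):

# Static loss scaling, illustrated: scale the loss before backward so small
# fp16 gradients don't underflow, then unscale the gradients before the step.
import torch

def scaled_backward_step(loss, optimizer, loss_scale=512.0):
    (loss * loss_scale).backward()
    for group in optimizer.param_groups:
        for p in group['params']:
            if p.grad is not None:
                p.grad.div_(loss_scale)  # unscale back to the true gradients
    optimizer.step()
    optimizer.zero_grad()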
projects/configs/bevformerv2/bevformerv2-r50-t1-24ep.py (new file, 0 → 100644)
# mAP: 0.3805
# mATE: 0.7198
# mASE: 0.2805
# mAOE: 0.4131
# mAVE: 0.7652
# mAAE: 0.1951
# NDS: 0.4529
_base_ = ['../_base_/default_runtime.py']

# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
    'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)

bev_h_ = 200
bev_w_ = 200
frames = (0,)
group_detr = 11
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]

ida_aug_conf = {
    "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768],  # (0.8, 1.2)
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": True,
}
ida_aug_conf_eval = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}

# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
#     }))

train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True,
         with_attr_label=False),
    dict(type='GlobalRotScaleTransImage',
         rot_range=[-22.5, 22.5],
         scale_ratio_range=[0.95, 1.05],
         translation_std=[0, 0, 0],
         reverse_angle=True,
         training=True,
         flip_dx_ratio=0.5,
         flip_dy_ratio=0.5,
         only_gt=True),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf,
         training=True, debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D',
         keys=['gt_bboxes_3d', 'gt_labels_3d', 'img',
               'ego2global_translation', 'ego2global_rotation',
               'lidar2ego_translation', 'lidar2ego_rotation', 'timestamp',
               'mono_input_dict', 'mono_ann_idx', 'aug_param']),
    dict(type='DD3DMapper',
         is_train=True,
         tasks=dict(box2d_on=True, box3d_on=True))
]
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval,
         training=False, debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='MultiScaleFlipAug3D',
         img_scale=(1600, 640),
         pts_scale_ratio=1,
         flip=False,
         transforms=[
             dict(type='DefaultFormatBundle3D', class_names=class_names,
                  with_label=False),
             dict(type='CustomCollect3D',
                  keys=['img', 'ego2global_translation',
                        'ego2global_rotation', 'lidar2ego_translation',
                        'lidar2ego_rotation', 'timestamp'])
         ])
]

data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root='data/nuscenes/',
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
evaluation = dict(interval=4, pipeline=eval_pipeline)

# model
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
_dim_ = 256
_pos_dim_ = 128
_ffn_dim_ = 512
_num_levels_ = 4
_num_mono_levels_ = 5

model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe'),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=4),
                            embed_dims=_dim_)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25, focal_loss_gamma=2.0,
            loc_loss_type='giou'),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
                             [0.61416006, 1.7016163, 1.3054738],
                             [2.9139307, 10.725025, 3.2832346],
                             [1.9751819, 4.641267, 1.74352],
                             [2.772134, 6.565072, 3.2474296],
                             [0.7800532, 2.138673, 1.4437162],
                             [0.6667362, 0.7181772, 1.7616143],
                             [0.40246472, 0.4027083, 1.0084083],
                             [3.0059454, 12.8197, 4.1213827],
                             [2.4986045, 6.9310856, 2.8382742]]),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
                               (512, 100000000.0))),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
    train_cfg=dict(pts=dict(
        grid_size=[512, 512, 1],
        voxel_size=voxel_size,
        point_cloud_range=point_cloud_range,
        out_size_factor=4,
        assigner=dict(
            type='HungarianAssigner3D',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='SmoothL1Cost', weight=0.75),
            iou_cost=dict(type='IoUCost', weight=0.0),
            pc_range=point_cloud_range))))

# optimizer
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(custom_keys=dict(img_backbone=dict(lr_mult=0.5))),
    weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[20, ])
total_epochs = 24
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
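This head swaps the decoder's vanilla self-attention for 'GroupMultiheadAttention' with group_detr = 11. The Group DETR idea is that, during training, the object queries are split into groups that self-attend independently, each group getting its own one-to-one assignment; at inference a single group is kept. A hedged sketch of the grouping mechanics (illustrative names, not this repo's API):

# Illustrative grouped self-attention: queries are chunked into groups and
# attention runs within each group only, here with plain nn.MultiheadAttention.
import torch
import torch.nn as nn

def grouped_self_attention(queries, attn, num_groups):
    # queries: (num_query, batch, embed_dims)
    outputs = [attn(q, q, q, need_weights=False)[0]
               for q in torch.chunk(queries, num_groups, dim=0)]
    return torch.cat(outputs, dim=0)

attn = nn.MultiheadAttention(embed_dim=256, num_heads=8)
queries = torch.randn(900, 2, 256)  # 900 queries, as in this config
# 900 is not divisible by 11, so torch.chunk yields uneven groups here;
# that is fine for this illustration.
out = grouped_self_attention(queries, attn, num_groups=11)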
projects/configs/bevformerv2/bevformerv2-r50-t1-48ep.py (new file, 0 → 100644)
# mAP: 0.3953
# mATE: 0.6941
# mASE: 0.2765
# mAOE: 0.4199
# mAVE: 0.7537
# mAAE: 0.1866
# NDS: 0.4646
_base_ = ['../_base_/default_runtime.py']

# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
    'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)

bev_h_ = 200
bev_w_ = 200
frames = (0,)
group_detr = 11
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]

ida_aug_conf = {
    "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768],  # (0.8, 1.2)
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": True,
}
ida_aug_conf_eval = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}

# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
#     }))

train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True,
         with_attr_label=False),
    dict(type='GlobalRotScaleTransImage',
         rot_range=[-22.5, 22.5],
         scale_ratio_range=[0.95, 1.05],
         translation_std=[0, 0, 0],
         reverse_angle=True,
         training=True,
         flip_dx_ratio=0.5,
         flip_dy_ratio=0.5,
         only_gt=True),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf,
         training=True, debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D',
         keys=['gt_bboxes_3d', 'gt_labels_3d', 'img',
               'ego2global_translation', 'ego2global_rotation',
               'lidar2ego_translation', 'lidar2ego_rotation', 'timestamp',
               'mono_input_dict', 'mono_ann_idx', 'aug_param']),
    dict(type='DD3DMapper',
         is_train=True,
         tasks=dict(box2d_on=True, box3d_on=True))
]
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval,
         training=False, debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='MultiScaleFlipAug3D',
         img_scale=(1600, 640),
         pts_scale_ratio=1,
         flip=False,
         transforms=[
             dict(type='DefaultFormatBundle3D', class_names=class_names,
                  with_label=False),
             dict(type='CustomCollect3D',
                  keys=['img', 'ego2global_translation',
                        'ego2global_rotation', 'lidar2ego_translation',
                        'lidar2ego_rotation', 'timestamp'])
         ])
]

data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root='data/nuscenes/',
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
evaluation = dict(interval=4, pipeline=eval_pipeline)

# model
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
_dim_ = 256
_pos_dim_ = 128
_ffn_dim_ = 512
_num_levels_ = 4
_num_mono_levels_ = 5

model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe'),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=4),
                            embed_dims=_dim_)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25, focal_loss_gamma=2.0,
            loc_loss_type='giou'),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
                             [0.61416006, 1.7016163, 1.3054738],
                             [2.9139307, 10.725025, 3.2832346],
                             [1.9751819, 4.641267, 1.74352],
                             [2.772134, 6.565072, 3.2474296],
                             [0.7800532, 2.138673, 1.4437162],
                             [0.6667362, 0.7181772, 1.7616143],
                             [0.40246472, 0.4027083, 1.0084083],
                             [3.0059454, 12.8197, 4.1213827],
                             [2.4986045, 6.9310856, 2.8382742]]),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
                               (512, 100000000.0))),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
    train_cfg=dict(pts=dict(
        grid_size=[512, 512, 1],
        voxel_size=voxel_size,
        point_cloud_range=point_cloud_range,
        out_size_factor=4,
        assigner=dict(
            type='HungarianAssigner3D',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='SmoothL1Cost', weight=0.75),
            iou_cost=dict(type='IoUCost', weight=0.0),
            pc_range=point_cloud_range))))

# optimizer
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(custom_keys=dict(img_backbone=dict(lr_mult=0.5))),
    weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[44, ])
total_epochs = 48
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
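The learning-rate policy above is a linear warmup followed by a step decay at epoch 44 of 48. A hedged sketch of the resulting schedule, assuming mmcv's default step factor of 0.1:

# Sketch of the 'step' policy with linear warmup as configured above.
def learning_rate(cur_iter, epoch, base_lr=4e-4, warmup_iters=2000,
                  warmup_ratio=1.0 / 3, steps=(44,), gamma=0.1):
    if cur_iter < warmup_iters:
        k = cur_iter / warmup_iters
        return base_lr * (warmup_ratio + (1.0 - warmup_ratio) * k)
    return base_lr * gamma ** sum(epoch >= s for s in steps)

print(learning_rate(0, 0))        # 4e-4 / 3: warmup starts at a third of lr
print(learning_rate(2000, 10))    # 4e-4: full lr once warmup is done
print(learning_rate(10**5, 44))   # ~4e-5: decayed 10x from epoch 44 on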
projects/configs/bevformerv2/bevformerv2-r50-t1-base-24ep.py (new file, 0 → 100644)
# mAP: 0.3512
# mATE: 0.7534
# mASE: 0.2863
# mAOE: 0.4665
# mAVE: 0.8070
# mAAE: 0.1861
# NDS: 0.4257
_base_ = ['../_base_/default_runtime.py']

# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
    'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)

bev_h_ = 200
bev_w_ = 200
frames = (0,)
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]

ida_aug_conf = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}
ida_aug_conf_eval = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}

# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
#     }))

train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True,
         with_attr_label=False),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf,
         training=True, debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D',
         keys=['gt_bboxes_3d', 'gt_labels_3d', 'img',
               'ego2global_translation', 'ego2global_rotation',
               'lidar2ego_translation', 'lidar2ego_rotation', 'timestamp',
               'mono_input_dict', 'mono_ann_idx', 'aug_param']),
    dict(type='DD3DMapper',
         is_train=True,
         tasks=dict(box2d_on=True, box3d_on=True))
]
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval,
         training=False, debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='MultiScaleFlipAug3D',
         img_scale=(1600, 640),
         pts_scale_ratio=1,
         flip=False,
         transforms=[
             dict(type='DefaultFormatBundle3D', class_names=class_names,
                  with_label=False),
             dict(type='CustomCollect3D',
                  keys=['img', 'ego2global_translation',
                        'ego2global_rotation', 'lidar2ego_translation',
                        'lidar2ego_rotation', 'timestamp'])
         ])
]

data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root='data/nuscenes/',
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
evaluation = dict(interval=4, pipeline=eval_pipeline)

# model
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
_dim_ = 256
_pos_dim_ = 128
_ffn_dim_ = 512
_num_levels_ = 4
_num_mono_levels_ = 5

model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe'),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=4),
                            embed_dims=_dim_)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25, focal_loss_gamma=2.0,
            loc_loss_type='giou'),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
                             [0.61416006, 1.7016163, 1.3054738],
                             [2.9139307, 10.725025, 3.2832346],
                             [1.9751819, 4.641267, 1.74352],
                             [2.772134, 6.565072, 3.2474296],
                             [0.7800532, 2.138673, 1.4437162],
                             [0.6667362, 0.7181772, 1.7616143],
                             [0.40246472, 0.4027083, 1.0084083],
                             [3.0059454, 12.8197, 4.1213827],
                             [2.4986045, 6.9310856, 2.8382742]]),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
                               (512, 100000000.0))),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
    train_cfg=dict(pts=dict(
        grid_size=[512, 512, 1],
        voxel_size=voxel_size,
        point_cloud_range=point_cloud_range,
        out_size_factor=4,
        assigner=dict(
            type='HungarianAssigner3D',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='SmoothL1Cost', weight=0.75),
            iou_cost=dict(type='IoUCost', weight=0.0),
            pc_range=point_cloud_range))))

# optimizer
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(custom_keys=dict(img_backbone=dict(lr_mult=0.5))),
    weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[20, ])
total_epochs = 24
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
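Note that img_norm_cfg in these bevformerv2 configs is caffe-style: the mean is given in BGR order, std is 1, and to_rgb=False, so images stay BGR to match the 'caffe' ResNet weights. A hedged sketch of the normalization this implies (an illustration, not the repo's NormalizeMultiviewImage transform itself):

# In spirit: subtract the per-channel BGR mean; a std of 1 leaves the scale
# untouched; no BGR->RGB channel swap is performed.
import numpy as np

def normalize_bgr(img, mean=(103.53, 116.28, 123.675), std=(1.0, 1.0, 1.0)):
    img = img.astype(np.float32)
    return (img - np.asarray(mean)) / np.asarray(std)

dummy = np.zeros((900, 1600, 3), dtype=np.uint8)  # one H x W x BGR camera view
print(normalize_bgr(dummy)[0, 0])  # [-103.53 -116.28 -123.675]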
projects/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py (new file, 0 → 100644)
# mAP: 0.3594
# mATE: 0.7327
# mASE: 0.2814
# mAOE: 0.4074
# mAVE: 0.7831
# mAAE: 0.1983
# NDS: 0.4394
_base_
=
[
'../_base_/default_runtime.py'
]
# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range
=
[
-
51.2
,
-
51.2
,
-
5.0
,
51.2
,
51.2
,
3.0
]
# For nuScenes we usually do 10-class detection
class_names
=
[
'barrier'
,
'bicycle'
,
'bus'
,
'car'
,
'construction_vehicle'
,
'motorcycle'
,
'pedestrian'
,
'traffic_cone'
,
'trailer'
,
'truck'
]
dataset_type
=
'CustomNuScenesDatasetV2'
data_root
=
'data/nuscenes/'
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality
=
dict
(
use_lidar
=
False
,
use_camera
=
True
,
use_radar
=
False
,
use_map
=
False
,
use_external
=
False
)
img_norm_cfg
=
dict
(
mean
=
[
103.53
,
116.28
,
123.675
],
std
=
[
1
,
1
,
1
],
to_rgb
=
False
)
bev_h_
=
200
bev_w_
=
200
frames
=
(
0
,)
voxel_size
=
[
102.4
/
bev_h_
,
102.4
/
bev_w_
,
8
]
ida_aug_conf
=
{
"reisze"
:
[
640
,
],
"crop"
:
(
0
,
260
,
1600
,
900
),
"H"
:
900
,
"W"
:
1600
,
"rand_flip"
:
False
,
}
ida_aug_conf_eval
=
{
"reisze"
:
[
640
,
],
"crop"
:
(
0
,
260
,
1600
,
900
),
"H"
:
900
,
"W"
:
1600
,
"rand_flip"
:
False
,
}
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
train_pipeline
=
[
dict
(
type
=
'LoadMultiViewImageFromFiles'
,
to_float32
=
True
),
dict
(
type
=
'PhotoMetricDistortionMultiViewImage'
),
dict
(
type
=
'LoadAnnotations3D'
,
with_bbox_3d
=
True
,
with_label_3d
=
True
,
with_attr_label
=
False
),
dict
(
type
=
'ObjectRangeFilter'
,
point_cloud_range
=
point_cloud_range
),
dict
(
type
=
'ObjectNameFilter'
,
classes
=
class_names
),
dict
(
type
=
'CropResizeFlipImage'
,
data_aug_conf
=
ida_aug_conf
,
training
=
True
,
debug
=
False
),
dict
(
type
=
'NormalizeMultiviewImage'
,
**
img_norm_cfg
),
dict
(
type
=
'PadMultiViewImage'
,
size_divisor
=
32
),
dict
(
type
=
'DefaultFormatBundle3D'
,
class_names
=
class_names
),
dict
(
type
=
'CustomCollect3D'
,
keys
=
[
'gt_bboxes_3d'
,
'gt_labels_3d'
,
'img'
,
'ego2global_translation'
,
'ego2global_rotation'
,
'lidar2ego_translation'
,
'lidar2ego_rotation'
,
'timestamp'
,
'mono_input_dict'
,
'mono_ann_idx'
,
'aug_param'
]),
dict
(
type
=
'DD3DMapper'
,
is_train
=
True
,
tasks
=
dict
(
box2d_on
=
True
,
box3d_on
=
True
),)
]
eval_pipeline
=
[
dict
(
type
=
'LoadMultiViewImageFromFiles'
,
to_float32
=
True
,
),
dict
(
type
=
'CropResizeFlipImage'
,
data_aug_conf
=
ida_aug_conf_eval
,
training
=
False
,
debug
=
False
),
dict
(
type
=
'NormalizeMultiviewImage'
,
**
img_norm_cfg
),
dict
(
type
=
'PadMultiViewImage'
,
size_divisor
=
32
),
dict
(
type
=
'MultiScaleFlipAug3D'
,
img_scale
=
(
1600
,
640
),
pts_scale_ratio
=
1
,
flip
=
False
,
transforms
=
[
dict
(
type
=
'DefaultFormatBundle3D'
,
class_names
=
class_names
,
with_label
=
False
),
dict
(
type
=
'CustomCollect3D'
,
keys
=
[
'img'
,
'ego2global_translation'
,
'ego2global_rotation'
,
'lidar2ego_translation'
,
'lidar2ego_rotation'
,
'timestamp'
])
])
]
data
=
dict
(
samples_per_gpu
=
1
,
workers_per_gpu
=
4
,
persistent_workers
=
True
,
train
=
dict
(
type
=
'CustomNuScenesDatasetV2',
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root='data/nuscenes/',
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
evaluation = dict(interval=4, pipeline=eval_pipeline)
# model
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
_dim_ = 256
_pos_dim_ = 128
_ffn_dim_ = 512
_num_levels_ = 4
_num_mono_levels_ = 5
model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe'),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=4),
                            embed_dims=_dim_)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
                             [0.61416006, 1.7016163, 1.3054738],
                             [2.9139307, 10.725025, 3.2832346],
                             [1.9751819, 4.641267, 1.74352],
                             [2.772134, 6.565072, 3.2474296],
                             [0.7800532, 2.138673, 1.4437162],
                             [0.6667362, 0.7181772, 1.7616143],
                             [0.40246472, 0.4027083, 1.0084083],
                             [3.0059454, 12.8197, 4.1213827],
                             [2.4986045, 6.9310856, 2.8382742]]),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
                               (512, 100000000.0))),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range))))
# optimizer
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys=dict(
            img_backbone=dict(lr_mult=0.5),
        )),
    weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[44, ])
total_epochs = 48
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
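A quick sanity check before launching training is to load a config with mmcv's Config API; a minimal sketch, assuming mmcv is installed, the working directory is the repo root, and the filename follows the listing order above:

from mmcv import Config

# Load the 48-epoch config above; path assumed from the commit's file listing.
cfg = Config.fromfile('projects/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py')
print(cfg.model.type)    # expected: 'BEVFormerV2'
print(cfg.total_epochs)  # expected: 48
# cfg.plugin_dir ('projects/mmdet3d_plugin/') must be imported before build_model,
# or the custom registry entries referenced here will not resolve.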
projects/configs/bevformerv2/bevformerv2-r50-t2-24ep.py
# mAP: 0.4199
# mATE: 0.6689
# mASE: 0.2814
# mAOE: 0.3915
# mAVE: 0.3834
# mAAE: 0.1928
# NDS: 0.5182
_base_ = ['../_base_/default_runtime.py']
# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
    'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
bev_h_ = 200
bev_w_ = 200
frames = (-1, 0,)
group_detr = 11
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
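# i.e. voxel_size == [0.512, 0.512, 8]: 102.4 / 200 = 0.512 m per BEV cell,
# so the 200 x 200 grid covers the full 102.4 m point-cloud range.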
ida_aug_conf = {
    "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768],  # (0.8, 1.2)
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": True,
}
ida_aug_conf_eval = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}
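# The crop box (0, 260, 1600, 900) reads naturally as (x1, y1, x2, y2): it drops
# the top 260 rows of the 1600x900 frame, leaving a 1600x640 region that matches
# img_scale=(1600, 640) in eval_pipeline below (assumed; not verified against
# the CropResizeFlipImage implementation).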
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
#     }))
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True,),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego2global_translation',
            'ego2global_rotation', 'lidar2ego_translation',
            'lidar2ego_rotation', 'timestamp', 'mono_input_dict',
            'mono_ann_idx', 'aug_param'
        ]),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),)
]
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True,),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img', 'ego2global_translation', 'ego2global_rotation',
                    'lidar2ego_translation', 'lidar2ego_rotation', 'timestamp'
                ])
        ])
]
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root='data/nuscenes/',
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
evaluation = dict(interval=4, pipeline=eval_pipeline)
# model
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
_dim_ = 256
_pos_dim_ = 128
_ffn_dim_ = 512
_num_levels_ = 4
_num_mono_levels_ = 5
model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe'),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=4),
                            embed_dims=_dim_)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
                             [0.61416006, 1.7016163, 1.3054738],
                             [2.9139307, 10.725025, 3.2832346],
                             [1.9751819, 4.641267, 1.74352],
                             [2.772134, 6.565072, 3.2474296],
                             [0.7800532, 2.138673, 1.4437162],
                             [0.6667362, 0.7181772, 1.7616143],
                             [0.40246472, 0.4027083, 1.0084083],
                             [3.0059454, 12.8197, 4.1213827],
                             [2.4986045, 6.9310856, 2.8382742]]),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
                               (512, 100000000.0))),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range))))
# optimizer
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys=dict(
            img_backbone=dict(lr_mult=0.5),
        )),
    weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[20, ])
total_epochs = 24
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
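For reference, the schedule above (linear warmup over 2000 iterations starting from lr/3, then a step drop after epoch 20) can be written out explicitly. A minimal sketch assuming mmcv's StepLrUpdaterHook semantics with the default gamma of 0.1:

def lr_at(base_lr, epoch, it, warmup_iters=2000, warmup_ratio=1.0 / 3,
          milestones=(20,), gamma=0.1):
    """Sketch of the warmup + step policy above (mmcv semantics assumed)."""
    if it < warmup_iters:
        # linear warmup: ramps from base_lr * warmup_ratio up to base_lr
        k = (1 - it / warmup_iters) * (1 - warmup_ratio)
        return base_lr * (1 - k)
    # step decay: multiply by gamma for every milestone already passed
    passed = sum(1 for m in milestones if epoch >= m)
    return base_lr * gamma ** passed

print(lr_at(4e-4, epoch=0, it=0))       # 4e-4 / 3 at the first iteration
print(lr_at(4e-4, epoch=10, it=5000))   # 4e-4 in the flat phase
print(lr_at(4e-4, epoch=21, it=90000))  # 4e-5 after the step at epoch 20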
projects/configs/bevformerv2/bevformerv2-r50-t2-48ep.py
# mAP: 0.4313
# mATE: 0.6557
# mASE: 0.2775
# mAOE: 0.3851
# mAVE: 0.3861
# mAAE: 0.1882
# NDS: 0.5264
_base_ = ['../_base_/default_runtime.py']
# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
    'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
bev_h_ = 200
bev_w_ = 200
frames = (-1, 0,)
group_detr = 11
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
ida_aug_conf = {
    "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768],  # (0.8, 1.2)
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": True,
}
ida_aug_conf_eval = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
#     }))
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True,),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego2global_translation',
            'ego2global_rotation', 'lidar2ego_translation',
            'lidar2ego_rotation', 'timestamp', 'mono_input_dict',
            'mono_ann_idx', 'aug_param'
        ]),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),)
]
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True,),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img', 'ego2global_translation', 'ego2global_rotation',
                    'lidar2ego_translation', 'lidar2ego_rotation', 'timestamp'
                ])
        ])
]
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root='data/nuscenes/',
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
evaluation = dict(interval=4, pipeline=eval_pipeline)
# model
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
_dim_ = 256
_pos_dim_ = 128
_ffn_dim_ = 512
_num_levels_ = 4
_num_mono_levels_ = 5
model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe'),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=4),
                            embed_dims=_dim_)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
                             [0.61416006, 1.7016163, 1.3054738],
                             [2.9139307, 10.725025, 3.2832346],
                             [1.9751819, 4.641267, 1.74352],
                             [2.772134, 6.565072, 3.2474296],
                             [0.7800532, 2.138673, 1.4437162],
                             [0.6667362, 0.7181772, 1.7616143],
                             [0.40246472, 0.4027083, 1.0084083],
                             [3.0059454, 12.8197, 4.1213827],
                             [2.4986045, 6.9310856, 2.8382742]]),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
                               (512, 100000000.0))),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range))))
# optimizer
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys=dict(
            img_backbone=dict(lr_mult=0.5),
        )),
    weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy (48-epoch schedule to match the filename and the reported
# metrics; mirrors the 48ep config above)
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[44, ])
total_epochs = 48
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
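The 'GroupMultiheadAttention' in the decoder above is the Group DETR device: at train time the 900 object queries are split into group_detr = 11 groups and decoder self-attention is confined to each group, while inference keeps a single group. A generic PyTorch sketch of that masking idea (a sketch of the published recipe, not this repo's exact implementation; the contiguous partition is an assumption):

import torch

num_query, groups = 900, 11
# Assumption for illustration: queries are partitioned contiguously into groups.
group_id = torch.arange(num_query) * groups // num_query      # (900,) in 0..10
# attn_mask[i, j] = True blocks attention from query i to query j
attn_mask = group_id.unsqueeze(0) != group_id.unsqueeze(1)    # (900, 900)
# Pass as attn_mask to torch.nn.MultiheadAttention during training;
# at inference only one group of queries is kept, so no mask is needed.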
projects/configs/bevformerv2/bevformerv2-r50-t8-24ep.py
# mAP: 0.4600
# mATE: 0.6185
# mASE: 0.2815
# mAOE: 0.3660
# mAVE: 0.3157
# mAAE: 0.1902
# NDS: 0.5528
_base_ = ['../_base_/default_runtime.py']
# Dataset
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
    'pedestrian', 'traffic_cone', 'trailer', 'truck'
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
bev_h_ = 200
bev_w_ = 200
frames = (-7, -6, -5, -4, -3, -2, -1, 0)
group_detr = 11
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
ida_aug_conf = {
    "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768],  # (0.8, 1.2)
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": True,
}
ida_aug_conf_eval = {
    "reisze": [640, ],
    "crop": (0, 260, 1600, 900),
    "H": 900,
    "W": 1600,
    "rand_flip": False,
}
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
#     }))
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True,),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego2global_translation',
            'ego2global_rotation', 'lidar2ego_translation',
            'lidar2ego_rotation', 'timestamp', 'mono_input_dict',
            'mono_ann_idx', 'aug_param'
        ]),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),)
]
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True,),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img', 'ego2global_translation', 'ego2global_rotation',
                    'lidar2ego_translation', 'lidar2ego_rotation', 'timestamp'
                ])
        ])
]
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root='data/nuscenes/',
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type='CustomNuScenesDatasetV2',
        frames=frames,
        data_root='data/nuscenes/',
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
evaluation = dict(interval=4, pipeline=eval_pipeline)
# model
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
_dim_ = 256
_pos_dim_ = 128
_ffn_dim_ = 512
_num_levels_ = 4
_num_mono_levels_ = 5
model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe'),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            inter_channels=_dim_ * 2,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=4),
                            embed_dims=_dim_)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
                             [0.61416006, 1.7016163, 1.3054738],
                             [2.9139307, 10.725025, 3.2832346],
                             [1.9751819, 4.641267, 1.74352],
                             [2.772134, 6.565072, 3.2474296],
                             [0.7800532, 2.138673, 1.4437162],
                             [0.6667362, 0.7181772, 1.7616143],
                             [0.40246472, 0.4027083, 1.0084083],
                             [3.0059454, 12.8197, 4.1213827],
                             [2.4986045, 6.9310856, 2.8382742]]),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
                               (512, 100000000.0))),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range))))
# optimizer
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys=dict(
            img_backbone=dict(lr_mult=0.5),
        )),
    weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[20, ])
total_epochs = 24
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
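Apart from the eight-frame history frames = (-7, ..., 0) and the wider inter_channels on the transformer, this file matches the t2 config above. A small sketch of how such a frame-offset tuple is commonly resolved to sample indices (clamping at the start of a scene is an assumption for illustration, not necessarily this dataset's exact logic):

frames = (-7, -6, -5, -4, -3, -2, -1, 0)
cur_idx = 3  # e.g. the 4th sample of a scene
history = [max(cur_idx + off, 0) for off in frames]
print(history)  # [0, 0, 0, 0, 0, 1, 2, 3] -> early frames repeat the scene start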
projects/configs/datasets/custom_lyft-3d.py
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-80, -80, -5, 80, 80, 3]
# For Lyft we usually do 9-class detection
class_names = [
    'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
    'bicycle', 'pedestrian', 'animal'
]
dataset_type = 'CustomLyftDataset'
data_root = 'data/lyft/'
# Input modality for Lyft dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=True,
    use_camera=False,
    use_radar=False,
    use_map=False,
    use_external=True)
file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/lyft/': 's3://lyft/lyft/',
#         'data/lyft/': 's3://lyft/lyft/'
#     }))
train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
    dict(
        type='GlobalRotScaleTrans',
        rot_range=[-0.3925, 0.3925],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0]),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='GlobalRotScaleTrans',
                rot_range=[0, 0],
                scale_ratio_range=[1., 1.],
                translation_std=[0, 0, 0]),
            dict(type='RandomFlip3D'),
            dict(
                type='PointsRangeFilter',
                point_cloud_range=point_cloud_range),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points'])
        ])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(
        type='DefaultFormatBundle3D',
        class_names=class_names,
        with_label=False),
    dict(type='Collect3D', keys=['points'])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'lyft_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'lyft_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'lyft_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True))
# For Lyft dataset, we usually evaluate the model at the end of training.
# Since the models are trained by 24 epochs by default, we set evaluation
# interval to be 24. Please change the interval accordingly if you do not
# use a default schedule.
evaluation = dict(interval=24, pipeline=eval_pipeline)
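If a non-default schedule is used, the interval can be overridden without editing this file. A sketch using mmcv's Config.merge_from_dict (dotted keys), assuming the file is loaded from the repo root:

from mmcv import Config

cfg = Config.fromfile('projects/configs/datasets/custom_lyft-3d.py')
cfg.merge_from_dict({'evaluation.interval': 12})  # e.g. for a 12-epoch schedule
print(cfg.evaluation.interval)  # 12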
projects/configs/datasets/custom_nus-3d.py
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-50, -50, -5, 50, 50, 3]
# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
dataset_type = 'NuScenesDataset_eval_modified'
data_root = 'data/nuscenes/'
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=True,
    use_camera=False,
    use_radar=False,
    use_map=False,
    use_external=False)
file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
#     }))
train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
    dict(
        type='GlobalRotScaleTrans',
        rot_range=[-0.3925, 0.3925],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0]),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='GlobalRotScaleTrans',
                rot_range=[0, 0],
                scale_ratio_range=[1., 1.],
                translation_std=[0, 0, 0]),
            dict(type='RandomFlip3D'),
            dict(
                type='PointsRangeFilter',
                point_cloud_range=point_cloud_range),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points'])
        ])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(
        type='DefaultFormatBundle3D',
        class_names=class_names,
        with_label=False),
    dict(type='Collect3D', keys=['points'])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR'),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR'))
# For nuScenes dataset, we usually evaluate the model at the end of training.
# Since the models are trained by 24 epochs by default, we set evaluation
# interval to be 24. Please change the interval accordingly if you do not
# use a default schedule.
evaluation = dict(interval=24, pipeline=eval_pipeline)
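# Note: the class order here ('car' first) differs from the bevformerv2 configs
# above ('barrier' first); since label indices follow list position, checkpoints
# and cached annotations are only interchangeable within one ordering.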
projects/configs/datasets/custom_waymo-3d.py
# dataset settings
# D5 in the config name means the whole dataset is divided into 5 folds
# We only use one fold for efficient experiments
dataset_type = 'CustomWaymoDataset'
data_root = 'data/waymo/kitti_format/'
file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel', path_mapping=dict(data='s3://waymo_data/'))
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
class_names = ['Car', 'Pedestrian', 'Cyclist']
point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
input_modality = dict(use_lidar=False, use_camera=True)
db_sampler = dict(
    data_root=data_root,
    info_path=data_root + 'waymo_dbinfos_train.pkl',
    rate=1.0,
    prepare=dict(
        filter_by_difficulty=[-1],
        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
    classes=class_names,
    sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
    points_loader=dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=[0, 1, 2, 3, 4],
        file_client_args=file_client_args))
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
]
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1920, 1280),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='CustomCollect3D', keys=['img'])
        ])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=4,
    train=dict(
        type='RepeatDataset',
        times=2,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            ann_file=data_root + 'waymo_infos_train.pkl',
            split='training',
            pipeline=train_pipeline,
            modality=input_modality,
            classes=class_names,
            test_mode=False,
            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
            box_type_3d='LiDAR',
            # load one frame every five frames
            load_interval=5)),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'waymo_infos_val.pkl',
        split='training',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True,
        box_type_3d='LiDAR'),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'waymo_infos_val.pkl',
        split='training',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True,
        box_type_3d='LiDAR'))
evaluation = dict(interval=24, pipeline=test_pipeline)
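For a rough sense of the epoch size implied by load_interval=5 combined with RepeatDataset(times=2); the raw frame count below is a placeholder, not taken from this repo:

n_frames = 150000           # hypothetical raw Waymo training frame count (placeholder)
per_pass = n_frames // 5    # load_interval=5 keeps one frame in five
epoch_len = 2 * per_pass    # RepeatDataset times=2 doubles each epoch
print(per_pass, epoch_len)  # 30000 60000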
projects/mmdet3d_plugin/__init__.py
from .core.bbox.assigners.hungarian_assigner_3d import HungarianAssigner3D
from .core.bbox.coders.nms_free_coder import NMSFreeCoder
from .core.bbox.match_costs import BBox3DL1Cost
from .core.evaluation.eval_hooks import CustomDistEvalHook
from .datasets.pipelines import (
    PhotoMetricDistortionMultiViewImage, PadMultiViewImage,
    NormalizeMultiviewImage, CustomCollect3D)
from .models.utils import *
from .models.opt.adamw import AdamW2
from .bevformer import *
from .dd3d import *
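These registrations only take effect once the package is imported. The training tools typically do this from plugin_dir, roughly as follows (a sketch of the usual mmdet plugin-import pattern, not copied from this commit):

import importlib

plugin_dir = 'projects/mmdet3d_plugin/'
module_path = plugin_dir.rstrip('/').replace('/', '.')  # 'projects.mmdet3d_plugin'
plg_lib = importlib.import_module(module_path)  # runs the imports above, registering everything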
projects/mmdet3d_plugin/bevformer/__init__.py
from .dense_heads import *
from .detectors import *
from .modules import *
from .runner import *
from .hooks import *
projects/mmdet3d_plugin/bevformer/apis/__init__.py
from .train import custom_train_model
from .mmdet_train import custom_train_detector
# from .test import custom_multi_gpu_test
projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import
random
import
warnings
import
numpy
as
np
import
torch
import
torch.distributed
as
dist
from
mmcv.parallel
import
MMDataParallel
,
MMDistributedDataParallel
from
mmcv.runner
import
(
HOOKS
,
DistSamplerSeedHook
,
EpochBasedRunner
,
Fp16OptimizerHook
,
OptimizerHook
,
build_optimizer
,
build_runner
,
get_dist_info
)
from
mmcv.utils
import
build_from_cfg
from
mmdet.core
import
EvalHook
from
mmdet.datasets
import
(
build_dataset
,
replace_ImageToTensor
)
from
mmdet.utils
import
get_root_logger
import
time
import
os.path
as
osp
from
projects.mmdet3d_plugin.datasets.builder
import
build_dataloader
from
projects.mmdet3d_plugin.core.evaluation.eval_hooks
import
CustomDistEvalHook
from
projects.mmdet3d_plugin.datasets
import
custom_build_dataset
from
mmcv.runner
import
Hook
class
ProfilerHook
(
Hook
):
def
__init__
(
self
,
profiler
,
total_steps
):
self
.
profiler
=
profiler
self
.
total_steps
=
total_steps
# 总步数 (wait + warmup + active) * repeat
self
.
stopped
=
False
def
after_train_iter
(
self
,
runner
):
# if not self.stopped:
# self.profiler.step()
# 检测是否完成所有schedule步骤
if
self
.
profiler
.
step_num
==
self
.
total_steps
-
1
and
not
self
.
stopped
:
# 停止Profiler
self
.
profiler
.
stop
()
self
.
stopped
=
True
# 只在rank 0上打印结果
rank
,
_
=
get_dist_info
()
if
rank
==
0
:
# 获取并打印关键指标
# table = self.profiler.key_averages().table(
# sort_by="self_cuda_time_total",
# row_limit=10
# )
# runner.logger.info(f"Profiler results after {self.total_steps} steps:\n{table}")
# table = self.profiler.key_averages().table(
# sort_by="self_cpu_time_total",
# row_limit=10
# )
# runner.logger.info(f"Profiler results after {self.total_steps} steps:\n{table}")
results
=
self
.
profiler
.
key_averages
().
table
(
sort_by
=
"cuda_time_total"
)
log_file
=
"./BW_log_step{}.txt"
.
format
(
self
.
total_steps
)
with
open
(
log_file
,
mode
=
'w'
)
as
file
:
file
.
write
(
str
(
results
))
# self.profiler.start()
if
not
self
.
stopped
:
self
.
profiler
.
step
()
def
custom_train_detector
(
model
,
dataset
,
cfg
,
distributed
=
False
,
validate
=
False
,
timestamp
=
None
,
eval_model
=
None
,
meta
=
None
):
logger
=
get_root_logger
(
cfg
.
log_level
)
# prepare data loaders
dataset
=
dataset
if
isinstance
(
dataset
,
(
list
,
tuple
))
else
[
dataset
]
#assert len(dataset)==1s
if
'imgs_per_gpu'
in
cfg
.
data
:
logger
.
warning
(
'"imgs_per_gpu" is deprecated in MMDet V2.0. '
'Please use "samples_per_gpu" instead'
)
if
'samples_per_gpu'
in
cfg
.
data
:
logger
.
warning
(
f
'Got "imgs_per_gpu"=
{
cfg
.
data
.
imgs_per_gpu
}
and '
f
'"samples_per_gpu"=
{
cfg
.
data
.
samples_per_gpu
}
, "imgs_per_gpu"'
f
'=
{
cfg
.
data
.
imgs_per_gpu
}
is used in this experiments'
)
else
:
logger
.
warning
(
'Automatically set "samples_per_gpu"="imgs_per_gpu"='
f
'
{
cfg
.
data
.
imgs_per_gpu
}
in this experiments'
)
cfg
.
data
.
samples_per_gpu
=
cfg
.
data
.
imgs_per_gpu
data_loaders
=
[
build_dataloader
(
ds
,
cfg
.
data
.
samples_per_gpu
,
cfg
.
data
.
workers_per_gpu
,
# cfg.gpus will be ignored if distributed
len
(
cfg
.
gpu_ids
),
dist
=
distributed
,
seed
=
cfg
.
seed
,
shuffler_sampler
=
cfg
.
data
.
shuffler_sampler
,
# dict(type='DistributedGroupSampler'),
nonshuffler_sampler
=
cfg
.
data
.
nonshuffler_sampler
,
# dict(type='DistributedSampler'),
)
for
ds
in
dataset
]
# put model on gpus
if
distributed
:
print
(
"============================distributed yes================================================="
)
find_unused_parameters
=
cfg
.
get
(
'find_unused_parameters'
,
False
)
# Sets the `find_unused_parameters` parameter in
# torch.nn.parallel.DistributedDataParallel
model
=
MMDistributedDataParallel
(
model
.
to
(
device
=
'cuda'
,
memory_format
=
torch
.
channels_last
),
# model.cuda(),
device_ids
=
[
torch
.
cuda
.
current_device
()],
broadcast_buffers
=
False
,
find_unused_parameters
=
find_unused_parameters
)
if
eval_model
is
not
None
:
eval_model
=
MMDistributedDataParallel
(
eval_model
.
to
(
device
=
'cuda'
,
memory_format
=
torch
.
channels_last
),
# model.cuda(),
device_ids
=
[
torch
.
cuda
.
current_device
()],
broadcast_buffers
=
False
,
find_unused_parameters
=
find_unused_parameters
)
else
:
print
(
"============================distributed no================================================="
)
model
=
MMDataParallel
(
model
.
cuda
(
cfg
.
gpu_ids
[
0
]),
device_ids
=
cfg
.
gpu_ids
)
if
eval_model
is
not
None
:
eval_model
=
MMDataParallel
(
eval_model
.
cuda
(
cfg
.
gpu_ids
[
0
]),
device_ids
=
                cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    if 'runner' not in cfg:
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)
    else:
        if 'total_epochs' in cfg:
            assert cfg.total_epochs == cfg.runner.max_epochs

    if eval_model is not None:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                eval_model=eval_model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))
    else:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))

    # register profiler hook
    # trace_config = dict(type='tb_trace', dir_name='work_dir')
    # profiler_config = dict(on_trace_ready=trace_config)
    # runner.register_profiler_hook(profiler_config)

    if distributed:
        if isinstance(runner, EpochBasedRunner):
            runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # Support batch_size > 1 in validation
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            assert False
            # Replace 'ImageToTensor' with 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            shuffler_sampler=cfg.data.shuffler_sampler,  # dict(type='DistributedGroupSampler'),
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,  # dict(type='DistributedSampler'),
        )
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_cfg['jsonfile_prefix'] = osp.join(
            'val', cfg.work_dir,
            time.ctime().replace(' ', '_').replace(':', '_'))
        eval_hook = CustomDistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    if cfg.get('enable_profiler', False):
        # build the profiler configuration
        total_steps = (1 + 20 + 1) * 1  # 22 steps = wait + warmup + active
        profiler = torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA
            ],
            schedule=torch.profiler.schedule(
                wait=1,     # skip the first step
                warmup=20,  # warm-up steps (not recorded in the trace)
                active=1,   # profile one step
                repeat=1    # run a single cycle only
            ),
            on_trace_ready=torch.profiler.tensorboard_trace_handler(
                # f"{cfg.work_dir}/profiler_logs"  # output directory
                "/home/BEVFormer/profiler_logs"
                # "./profiler_logs"
            ),
            with_stack=True,       # collect call-stack information
            profile_memory=False,  # do not profile memory usage
            record_shapes=False    # do not record tensor shapes
        )
        # create and register the ProfilerHook
        # profiler_hook = ProfilerHook(profiler)
        profiler_hook = ProfilerHook(profiler, total_steps)
        runner.register_hook(profiler_hook)
        # start the profiler
        profiler.start()
        print("==================================== profiler.start()===================================================================")
        try:
            # run training
            runner.run(data_loaders, cfg.workflow)
        finally:
            # make sure the profiler is stopped
            profiler.stop()
    else:
        # normal training
        runner.run(data_loaders, cfg.workflow)
    # runner.run(data_loaders, cfg.workflow)
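The profiling branch above is driven entirely by the config. A minimal sketch of the keys it reads, assuming an mmcv-style config file (the runner section is the standard mmcv convention; enable_profiler is the flag read via cfg.get above — nothing else here is taken from this commit's actual configs):

enable_profiler = True  # turns on the torch.profiler branch in custom_train_detector
runner = dict(type='EpochBasedRunner', max_epochs=24)  # consumed by build_runner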
projects/mmdet3d_plugin/bevformer/apis/mmdet_train_nhwc.py
0 → 100644
View file @
4cd43886
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import random
import warnings

import numpy as np
import torch
import torch.distributed as dist
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
                         Fp16OptimizerHook, OptimizerHook, build_optimizer,
                         build_runner, get_dist_info)
from mmcv.utils import build_from_cfg

from mmdet.core import EvalHook
from mmdet.datasets import (build_dataset, replace_ImageToTensor)
from mmdet.utils import get_root_logger

import time
import os.path as osp

from projects.mmdet3d_plugin.datasets.builder import build_dataloader
from projects.mmdet3d_plugin.core.evaluation.eval_hooks import CustomDistEvalHook
from projects.mmdet3d_plugin.datasets import custom_build_dataset
from mmcv.runner import Hook


class ProfilerHook(Hook):

    def __init__(self, profiler):
        self.profiler = profiler

    def after_train_iter(self, runner):
        self.profiler.step()
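# Note: calling profiler.step() once per training iteration is what advances
# the wait/warmup/active schedule configured further below. A minimal
# standalone sketch of the same pattern (hypothetical loop, not in this file):
#
#   prof = torch.profiler.profile(
#       schedule=torch.profiler.schedule(wait=1, warmup=1, active=2, repeat=1))
#   prof.start()
#   for _ in range(8):
#       ...                # one training iteration
#       prof.step()        # mirrors ProfilerHook.after_train_iter
#   prof.stop()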
def custom_train_detector(model,
                          dataset,
                          cfg,
                          distributed=False,
                          validate=False,
                          timestamp=None,
                          eval_model=None,
                          meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    # assert len(dataset) == 1
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed,
            shuffler_sampler=cfg.data.shuffler_sampler,
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,
            # have the dataloader emit channels_last batches
            to_channels_last=True
        ) for ds in dataset
    ]

    # move the model (and the optional eval model) to NHWC memory format
    model = model.to(device='cuda', memory_format=torch.channels_last)
    if eval_model is not None:
        eval_model = eval_model.to(device='cuda',
                                   memory_format=torch.channels_last)
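    # channels_last keeps activations in NHWC order, the layout preferred by
    # cuDNN/Tensor Core convolution kernels; with the dataloader also emitting
    # channels_last batches (to_channels_last=True above), convolutions avoid
    # extra layout transposes. A quick illustration (assumed toy shape):
    #
    #   x = torch.randn(1, 3, 224, 224).to(memory_format=torch.channels_last)
    #   x.is_contiguous(memory_format=torch.channels_last)  # True
    #   x.stride()  # (150528, 1, 672, 3) -- the channel stride is 1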
    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        model = MMDistributedDataParallel(
            model,
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
        if eval_model is not None:
            eval_model = MMDistributedDataParallel(
                eval_model,
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model, device_ids=cfg.gpu_ids)
        if eval_model is not None:
            eval_model = MMDataParallel(eval_model, device_ids=cfg.gpu_ids)
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    if 'runner' not in cfg:
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)
    else:
        if 'total_epochs' in cfg:
            assert cfg.total_epochs == cfg.runner.max_epochs

    if eval_model is not None:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                eval_model=eval_model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))
    else:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))

    # register profiler hook
    # trace_config = dict(type='tb_trace', dir_name='work_dir')
    # profiler_config = dict(on_trace_ready=trace_config)
    # runner.register_profiler_hook(profiler_config)
    if distributed:
        if isinstance(runner, EpochBasedRunner):
            runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            shuffler_sampler=cfg.data.shuffler_sampler,
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,
            # have the dataloader emit channels_last batches
            to_channels_last=True
        )
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_cfg['jsonfile_prefix'] = osp.join(
            'val', cfg.work_dir,
            time.ctime().replace(' ', '_').replace(':', '_'))
        eval_hook = CustomDistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))
    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    if cfg.get('enable_profiler', False):
        # build the profiler configuration
        profiler = torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA
            ],
            schedule=torch.profiler.schedule(
                wait=1,     # skip the first step
                warmup=30,  # warm-up steps (not recorded in the trace)
                active=3,   # profile three steps
                repeat=1    # run a single cycle only
            ),
            on_trace_ready=torch.profiler.tensorboard_trace_handler(
                # f"{cfg.work_dir}/profiler_logs"  # output directory
                "/home/BEVFormer/profiler_logs"
                # "./profiler_logs"
            ),
            with_stack=True,      # collect call-stack information
            profile_memory=True,  # profile memory usage
            record_shapes=True    # record tensor shapes
        )
        # create and register the ProfilerHook
        profiler_hook = ProfilerHook(profiler)
        runner.register_hook(profiler_hook)
        # start the profiler
        profiler.start()
        print("==================================== profiler.start()===================================================================")
        try:
            # run training
            runner.run(data_loaders, cfg.workflow)
        finally:
            # make sure the profiler is stopped
            profiler.stop()
    else:
        # normal training
        runner.run(data_loaders, cfg.workflow)
    # runner.run(data_loaders, cfg.workflow)
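The traces written by tensorboard_trace_handler above can be inspected in TensorBoard, assuming the torch-tb-profiler plugin is installed, by pointing it at the output directory: tensorboard --logdir /home/BEVFormer/profiler_logs.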
projects/mmdet3d_plugin/bevformer/apis/mmdet_train_profiler.py
0 → 100644
View file @
4cd43886
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import random
import warnings

import numpy as np
import torch
import torch.distributed as dist
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
                         Fp16OptimizerHook, OptimizerHook, build_optimizer,
                         build_runner, get_dist_info)
from mmcv.utils import build_from_cfg

from mmdet.core import EvalHook
from mmdet.datasets import (build_dataset, replace_ImageToTensor)
from mmdet.utils import get_root_logger

import time
import os.path as osp

from projects.mmdet3d_plugin.datasets.builder import build_dataloader
from projects.mmdet3d_plugin.core.evaluation.eval_hooks import CustomDistEvalHook
from projects.mmdet3d_plugin.datasets import custom_build_dataset
from mmcv.runner import Hook


class ProfilerHook(Hook):

    def __init__(self, profiler):
        self.profiler = profiler

    def after_train_iter(self, runner):
        self.profiler.step()
def custom_train_detector(model,
                          dataset,
                          cfg,
                          distributed=False,
                          validate=False,
                          timestamp=None,
                          eval_model=None,
                          meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    # assert len(dataset) == 1
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed,
            shuffler_sampler=cfg.data.shuffler_sampler,  # dict(type='DistributedGroupSampler'),
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,  # dict(type='DistributedSampler'),
        ) for ds in dataset
    ]
    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
        if eval_model is not None:
            eval_model = MMDistributedDataParallel(
                eval_model.cuda(),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
        if eval_model is not None:
            eval_model = MMDataParallel(
                eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    if 'runner' not in cfg:
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)
    else:
        if 'total_epochs' in cfg:
            assert cfg.total_epochs == cfg.runner.max_epochs

    if eval_model is not None:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                eval_model=eval_model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))
    else:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))

    # register profiler hook
    # trace_config = dict(type='tb_trace', dir_name='work_dir')
    # profiler_config = dict(on_trace_ready=trace_config)
    # runner.register_profiler_hook(profiler_config)
    if distributed:
        if isinstance(runner, EpochBasedRunner):
            runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # Support batch_size > 1 in validation
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            assert False
            # Replace 'ImageToTensor' with 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            shuffler_sampler=cfg.data.shuffler_sampler,  # dict(type='DistributedGroupSampler'),
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,  # dict(type='DistributedSampler'),
        )
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_cfg['jsonfile_prefix'] = osp.join(
            'val', cfg.work_dir,
            time.ctime().replace(' ', '_').replace(':', '_'))
        eval_hook = CustomDistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))
    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    if cfg.get('enable_profiler', False):
        # build the profiler configuration
        profiler = torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA
            ],
            schedule=torch.profiler.schedule(
                wait=1,     # skip the first step
                warmup=30,  # warm-up steps (not recorded in the trace)
                active=3,   # profile three steps
                repeat=1    # run a single cycle only
            ),
            on_trace_ready=torch.profiler.tensorboard_trace_handler(
                # f"{cfg.work_dir}/profiler_logs"  # output directory
                "/home/SparseDrive/profiler_logs"
                # "./profiler_logs"
            ),
            with_stack=True,      # collect call-stack information
            profile_memory=True,  # profile memory usage
            record_shapes=True    # record tensor shapes
        )
        # create and register the ProfilerHook
        profiler_hook = ProfilerHook(profiler)
        runner.register_hook(profiler_hook)
        # start the profiler
        profiler.start()
        print("==================================== profiler.start()===================================================================")
        try:
            # run training
            runner.run(data_loaders, cfg.workflow)
        finally:
            # make sure the profiler is stopped
            profiler.stop()
    else:
        # normal training
        runner.run(data_loaders, cfg.workflow)
    # runner.run(data_loaders, cfg.workflow)
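As a sanity check on the schedule above: with wait=1, warmup=30, active=3 and repeat=1, exactly one trace is written and it covers three training iterations. A back-of-envelope sketch of the arithmetic:

wait, warmup, active = 1, 30, 3
steps_per_cycle = wait + warmup + active          # 34 profiler.step() calls in total
recorded = range(wait + warmup, steps_per_cycle)  # steps 31..33 (0-indexed) are traced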
projects/mmdet3d_plugin/bevformer/apis/test.py
0 → 100644
View file @
4cd43886
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import os.path as osp
import pickle
import shutil
import tempfile
import time

import mmcv
import torch
import torch.distributed as dist
from mmcv.image import tensor2imgs
from mmcv.runner import get_dist_info

from mmdet.core import encode_mask_results

import numpy as np
import pycocotools.mask as mask_util


def custom_encode_mask_results(mask_results):
    """Encode bitmap mask to RLE code. Semantic Masks only
    Args:
        mask_results (list | tuple[list]): bitmap mask results.
            In mask scoring rcnn, mask_results is a tuple of (segm_results,
            segm_cls_score).
    Returns:
        list | tuple: RLE encoded mask.
    """
    cls_segms = mask_results
    num_classes = len(cls_segms)
    encoded_mask_results = []
    for i in range(len(cls_segms)):
        encoded_mask_results.append(
            mask_util.encode(
                np.array(
                    cls_segms[i][:, :, np.newaxis], order='F',
                    dtype='uint8'))[0])  # encoded with RLE
    return [encoded_mask_results]
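# For reference, mask_util.encode expects a Fortran-ordered uint8 array of
# shape (H, W, N) and returns one RLE dict per instance; decode inverts it.
# A minimal round-trip sketch (toy mask, not part of this module):
#
#   mask = np.zeros((4, 4, 1), dtype='uint8', order='F')
#   mask[1:3, 1:3, 0] = 1
#   rle = mask_util.encode(mask)[0]     # {'size': [4, 4], 'counts': b'...'}
#   assert (mask_util.decode(rle) == mask[..., 0]).all()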
def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
    """Test model with multiple gpus.
    This method tests a model with multiple gpus and collects the results
    under two different modes: gpu and cpu. By setting 'gpu_collect=True'
    it encodes results to gpu tensors and uses gpu communication for results
    collection. In cpu mode it saves the results on different gpus to 'tmpdir'
    and collects them by the rank 0 worker.
    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        tmpdir (str): Path of directory to save the temporary results from
            different gpus under cpu mode.
        gpu_collect (bool): Option to use either gpu or cpu to collect results.
    Returns:
        list: The prediction results.
    """
    model.eval()
    bbox_results = []
    mask_results = []
    dataset = data_loader.dataset
    rank, world_size = get_dist_info()
    if rank == 0:
        prog_bar = mmcv.ProgressBar(len(dataset))
    time.sleep(2)  # This line can prevent a deadlock problem in some cases.
    have_mask = False
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)
            # encode mask results
            if isinstance(result, dict):
                if 'bbox_results' in result.keys():
                    bbox_result = result['bbox_results']
                    batch_size = len(result['bbox_results'])
                    bbox_results.extend(bbox_result)
                if 'mask_results' in result.keys() and result['mask_results'] is not None:
                    mask_result = custom_encode_mask_results(result['mask_results'])
                    mask_results.extend(mask_result)
                    have_mask = True
            else:
                batch_size = len(result)
                bbox_results.extend(result)

            # if isinstance(result[0], tuple):
            #     assert False, 'this code is for instance segmentation, which our code will not utilize.'
            #     result = [(bbox_results, encode_mask_results(mask_results))
            #               for bbox_results, mask_results in result]
        if rank == 0:
            for _ in range(batch_size * world_size):
                prog_bar.update()

    # collect results from all ranks
    if gpu_collect:
        bbox_results = collect_results_gpu(bbox_results, len(dataset))
        if have_mask:
            mask_results = collect_results_gpu(mask_results, len(dataset))
        else:
            mask_results = None
    else:
        bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir)
        tmpdir = tmpdir + '_mask' if tmpdir is not None else None
        if have_mask:
            mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir)
        else:
            mask_results = None

    if mask_results is None:
        return bbox_results
    return {'bbox_results': bbox_results, 'mask_results': mask_results}
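# A typical caller (hypothetical, mirroring tools/test.py in mmdet-style
# repos; not part of this commit):
#
#   outputs = custom_multi_gpu_test(model, data_loader,
#                                   tmpdir='.dist_test/results',
#                                   gpu_collect=False)
#   # rank 0 receives the gathered results; all other ranks receive None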
def collect_results_cpu(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            mmcv.mkdir_or_exist('.dist_test')
            tmpdir = tempfile.mkdtemp(dir='.dist_test')
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, f'part_{i}.pkl')
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        '''
        because we changed the sampler of the evaluation stage to make sure
        that each gpu handles a continuous block of samples, the parts can be
        concatenated in rank order instead of interleaved
        '''
        # for res in zip(*part_list):
        for res in part_list:
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
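# The tmpdir handshake above space-pads (byte 32) the path into a fixed-size
# uint8 CUDA tensor so it can be broadcast over NCCL, then decodes and strips
# the padding on every rank. Standalone sketch (assumes an initialized
# process group; illustrative only):
#
#   buf = torch.full((512, ), 32, dtype=torch.uint8, device='cuda')
#   if rank == 0:
#       path = tempfile.mkdtemp(dir='.dist_test').encode()
#       buf[:len(path)] = torch.tensor(bytearray(path), dtype=torch.uint8,
#                                      device='cuda')
#   dist.broadcast(buf, 0)
#   path = buf.cpu().numpy().tobytes().decode().rstrip()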
def collect_results_gpu(result_part, size):
    # NOTE: falls back to CPU-based collection; return its result so that
    # callers using gpu_collect=True still receive the gathered results.
    return collect_results_cpu(result_part, size)
\ No newline at end of file
projects/mmdet3d_plugin/bevformer/apis/train.py
0 → 100644
View file @
4cd43886
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from .mmdet_train import custom_train_detector
from mmseg.apis import train_segmentor
from mmdet.apis import train_detector


def custom_train_model(model,
                       dataset,
                       cfg,
                       distributed=False,
                       validate=False,
                       timestamp=None,
                       eval_model=None,
                       meta=None):
    """A function wrapper for launching model training according to cfg.
    We need a different eval_hook in the runner; this wrapper should be
    deprecated in the future.
    """
    if cfg.model.type in ['EncoderDecoder3D']:
        assert False
    else:
        custom_train_detector(
            model,
            dataset,
            cfg,
            distributed=distributed,
            validate=validate,
            timestamp=timestamp,
            eval_model=eval_model,
            meta=meta)


def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """A function wrapper for launching model training according to cfg.
    We need a different eval_hook in the runner; this wrapper should be
    deprecated in the future.
    """
    if cfg.model.type in ['EncoderDecoder3D']:
        train_segmentor(
            model,
            dataset,
            cfg,
            distributed=distributed,
            validate=validate,
            timestamp=timestamp,
            meta=meta)
    else:
        train_detector(
            model,
            dataset,
            cfg,
            distributed=distributed,
            validate=validate,
            timestamp=timestamp,
            meta=meta)
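For context, a caller of custom_train_model would look roughly like the sketch below (hypothetical; the builder names mirror the mmdet3d convention and are not part of this commit):

from mmdet3d.models import build_model  # assumed builder import
from projects.mmdet3d_plugin.datasets import custom_build_dataset

model = build_model(cfg.model, train_cfg=cfg.get('train_cfg'))
datasets = [custom_build_dataset(cfg.data.train)]
custom_train_model(model, datasets, cfg,
                   distributed=True, validate=True, timestamp=timestamp)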