Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
TS-MODELS-OPT
training
Autonomous-Driving-models
Commits
19472568
"tests/TestMonteCarloFlexibleBarostat.h" did not exist on "01269257047f880c124e294452a94632b6dd88e7"
Commit
19472568
authored
Apr 08, 2026
by
雍大凯
Browse files
将子模块转换为普通目录
parent
51e55208
Changes
233
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
4711 additions
and
0 deletions
+4711
-0
docker-hub/MapTRv2/MapTR/projects/configs/_base_/schedules/seg_cosine_150e.py
...apTR/projects/configs/_base_/schedules/seg_cosine_150e.py
+9
-0
docker-hub/MapTRv2/MapTR/projects/configs/_base_/schedules/seg_cosine_200e.py
...apTR/projects/configs/_base_/schedules/seg_cosine_200e.py
+9
-0
docker-hub/MapTRv2/MapTR/projects/configs/_base_/schedules/seg_cosine_50e.py
...MapTR/projects/configs/_base_/schedules/seg_cosine_50e.py
+9
-0
docker-hub/MapTRv2/MapTR/projects/configs/bevformer/bevformer_base.py
...apTRv2/MapTR/projects/configs/bevformer/bevformer_base.py
+257
-0
docker-hub/MapTRv2/MapTR/projects/configs/bevformer/bevformer_small.py
...pTRv2/MapTR/projects/configs/bevformer/bevformer_small.py
+268
-0
docker-hub/MapTRv2/MapTR/projects/configs/bevformer/bevformer_tiny.py
...apTRv2/MapTR/projects/configs/bevformer/bevformer_tiny.py
+270
-0
docker-hub/MapTRv2/MapTR/projects/configs/bevformer_fp16/bevformer_tiny_fp16.py
...TR/projects/configs/bevformer_fp16/bevformer_tiny_fp16.py
+272
-0
docker-hub/MapTRv2/MapTR/projects/configs/datasets/custom_lyft-3d.py
...MapTRv2/MapTR/projects/configs/datasets/custom_lyft-3d.py
+136
-0
docker-hub/MapTRv2/MapTR/projects/configs/datasets/custom_nus-3d.py
.../MapTRv2/MapTR/projects/configs/datasets/custom_nus-3d.py
+141
-0
docker-hub/MapTRv2/MapTR/projects/configs/datasets/custom_waymo-3d.py
...apTRv2/MapTR/projects/configs/datasets/custom_waymo-3d.py
+112
-0
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_nano_r18_110e.py
...pTRv2/MapTR/projects/configs/maptr/maptr_nano_r18_110e.py
+312
-0
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_tiny_fusion_24e.py
...Rv2/MapTR/projects/configs/maptr/maptr_tiny_fusion_24e.py
+342
-0
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_tiny_r50_110e.py
...pTRv2/MapTR/projects/configs/maptr/maptr_tiny_r50_110e.py
+310
-0
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_tiny_r50_24e.py
...apTRv2/MapTR/projects/configs/maptr/maptr_tiny_r50_24e.py
+310
-0
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer.py
...TR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer.py
+308
-0
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevpool.py
...apTR/projects/configs/maptr/maptr_tiny_r50_24e_bevpool.py
+290
-0
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_tiny_r50_av2_24e.py
...v2/MapTR/projects/configs/maptr/maptr_tiny_r50_av2_24e.py
+315
-0
docker-hub/MapTRv2/MapTR/projects/configs/maptrv2/maptrv2_av2_3d_r50_6ep.py
.../MapTR/projects/configs/maptrv2/maptrv2_av2_3d_r50_6ep.py
+350
-0
docker-hub/MapTRv2/MapTR/projects/configs/maptrv2/maptrv2_av2_3d_r50_6ep_w_centerline.py
...ts/configs/maptrv2/maptrv2_av2_3d_r50_6ep_w_centerline.py
+350
-0
docker-hub/MapTRv2/MapTR/projects/configs/maptrv2/maptrv2_nusc_r50_24ep.py
...2/MapTR/projects/configs/maptrv2/maptrv2_nusc_r50_24ep.py
+341
-0
No files found.
docker-hub/MapTRv2/MapTR/projects/configs/_base_/schedules/seg_cosine_150e.py
0 → 100644
View file @
19472568
# Training schedule: SGD + cosine-annealed learning rate, 150 epochs.
# This schedule is mainly used on S3DIS dataset in segmentation task

# optimizer
optimizer = dict(
    type='SGD',
    lr=0.2,
    weight_decay=0.0001,
    momentum=0.9,
)
# No gradient clipping for this schedule.
optimizer_config = dict(grad_clip=None)
# Cosine annealing without warmup; `min_lr` is given as an absolute value.
lr_config = dict(
    policy='CosineAnnealing',
    warmup=None,
    min_lr=0.002,
)
momentum_config = None

# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=150)
docker-hub/MapTRv2/MapTR/projects/configs/_base_/schedules/seg_cosine_200e.py
0 → 100644
View file @
19472568
# Training schedule: Adam + cosine-annealed learning rate, 200 epochs.
# This schedule is mainly used on ScanNet dataset in segmentation task

# optimizer
optimizer = dict(
    type='Adam',
    lr=0.001,
    weight_decay=0.01,
)
# No gradient clipping for this schedule.
optimizer_config = dict(grad_clip=None)
# Cosine annealing without warmup; `min_lr` is given as an absolute value.
lr_config = dict(
    policy='CosineAnnealing',
    warmup=None,
    min_lr=1e-5,
)
momentum_config = None

# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=200)
docker-hub/MapTRv2/MapTR/projects/configs/_base_/schedules/seg_cosine_50e.py
0 → 100644
View file @
19472568
# Training schedule: Adam + cosine-annealed learning rate, 50 epochs.
# This schedule is mainly used on S3DIS dataset in segmentation task

# optimizer
optimizer = dict(
    type='Adam',
    lr=0.001,
    weight_decay=0.001,
)
# No gradient clipping for this schedule.
optimizer_config = dict(grad_clip=None)
# Cosine annealing without warmup; `min_lr` is given as an absolute value.
lr_config = dict(
    policy='CosineAnnealing',
    warmup=None,
    min_lr=1e-5,
)
momentum_config = None

# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=50)
docker-hub/MapTRv2/MapTR/projects/configs/bevformer/bevformer_base.py
0 → 100644
View file @
19472568
# BEVFormer-base: camera-only 3D detection config for nuScenes
# (ResNet-101 + DCNv2 backbone, 200x200 BEV grid, 6 encoder layers).
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py',
]

# Load the project-local plugin package that registers the custom modules
# referenced by `type=...` below.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# NOTE(review): presumably [x_min, y_min, z_min, x_max, y_max, z_max] - confirm.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]

# Mean-only normalization with unit std and no BGR->RGB swap.
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675],
    std=[1.0, 1.0, 1.0],
    to_rgb=False,
)

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True,
)

# Shared dimensions reused throughout the model definition.
_dim_ = 256
_pos_dim_ = _dim_ // 2  # 128
_ffn_dim_ = _dim_ * 2  # 512
_num_levels_ = 4
bev_h_ = 200
bev_w_ = 200
queue_length = 4  # each sequence contains `queue_length` frames.

model = dict(
    type='BEVFormer',
    use_grid_mask=True,
    video_test_mode=True,
    img_backbone=dict(
        type='ResNet',
        depth=101,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN2d', requires_grad=False),
        norm_eval=True,
        style='caffe',
        # original DCNv2 will print log when perform load_state_dict
        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
        stage_with_dcn=(False, False, True, True),
    ),
    img_neck=dict(
        type='FPN',
        # Three backbone outputs in, four pyramid levels out.
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True,
    ),
    pts_bbox_head=dict(
        type='BEVFormerHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=_num_levels_,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10,
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                # Fake cost. This is just to make it compatible with DETR head.
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)

dataset_type = 'CustomNuScenesDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']),
]

test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='CustomCollect3D', keys=['img']),
        ],
    ),
]

data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR',
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    # `test` reads the same annotation file as `val`.
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)

optimizer = dict(
    type='AdamW',
    lr=2e-4,
    paramwise_cfg=dict(
        custom_keys={
            # lr multiplier of 0.1 for the 'img_backbone' key.
            'img_backbone': dict(lr_mult=0.1),
        },
    ),
    weight_decay=0.01,
)

optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3,
)
total_epochs = 24
evaluation = dict(interval=1, pipeline=test_pipeline)

runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth'
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ],
)

checkpoint_config = dict(interval=1)
docker-hub/MapTRv2/MapTR/projects/configs/bevformer/bevformer_small.py
0 → 100644
View file @
19472568
# BEVFormer-small consumes at least 10500M GPU memory
# compared to bevformer_base, bevformer_small has
# smaller BEV: 200*200 -> 150*150
# less encoder layers: 6 -> 3
# smaller input size: 1600*900 -> (1600*900)*0.8
# multi-scale features -> single scale features (C5)
# with_cp of backbone = True

_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py',
]

# Load the project-local plugin package that registers the custom modules
# referenced by `type=...` below.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# NOTE(review): presumably [x_min, y_min, z_min, x_max, y_max, z_max] - confirm.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]

# Mean-only normalization with unit std and no BGR->RGB swap.
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675],
    std=[1.0, 1.0, 1.0],
    to_rgb=False,
)

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True,
)

# Shared dimensions reused throughout the model definition.
_dim_ = 256
_pos_dim_ = _dim_ // 2  # 128
_ffn_dim_ = _dim_ * 2  # 512
_num_levels_ = 1  # single-scale features
bev_h_ = 150
bev_w_ = 150
queue_length = 3  # each sequence contains `queue_length` frames.

model = dict(
    type='BEVFormer',
    use_grid_mask=True,
    video_test_mode=True,
    img_backbone=dict(
        type='ResNet',
        depth=101,
        num_stages=4,
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN2d', requires_grad=False),
        norm_eval=True,
        style='caffe',
        with_cp=True,  # using checkpoint to save GPU memory
        # original DCNv2 will print log when perform load_state_dict
        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
        stage_with_dcn=(False, False, True, True),
    ),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True,
    ),
    pts_bbox_head=dict(
        type='BEVFormerHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=3,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=_num_levels_,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10,
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                # Fake cost. This is just to make it compatible with DETR head.
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)

dataset_type = 'CustomNuScenesDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    # Rescale by the fixed 0.8 factor before padding.
    dict(type='RandomScaleImageMultiViewImage', scales=[0.8]),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']),
]

test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    # Padding is applied inside MultiScaleFlipAug3D below, after rescaling:
    # dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='RandomScaleImageMultiViewImage', scales=[0.8]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='CustomCollect3D', keys=['img']),
        ],
    ),
]

data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR',
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    # `test` reads the same annotation file as `val`.
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)

optimizer = dict(
    type='AdamW',
    lr=2e-4,
    paramwise_cfg=dict(
        custom_keys={
            # lr multiplier of 0.1 for the 'img_backbone' key.
            'img_backbone': dict(lr_mult=0.1),
        },
    ),
    weight_decay=0.01,
)

optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3,
)
total_epochs = 24
evaluation = dict(interval=1, pipeline=test_pipeline)

runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth'
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ],
)

checkpoint_config = dict(interval=1)
docker-hub/MapTRv2/MapTR/projects/configs/bevformer/bevformer_tiny.py
0 → 100644
View file @
19472568
# BEVFormer-tiny consumes at least 6700M GPU memory
# compared to bevformer_base, bevformer_tiny has
# smaller backbone: R101-DCN -> R50
# smaller BEV: 200*200 -> 50*50
# less encoder layers: 6 -> 3
# smaller input size: 1600*900 -> 800*450
# multi-scale features -> single scale features (C5)

_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py',
]

# Load the project-local plugin package that registers the custom modules
# referenced by `type=...` below.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# NOTE(review): presumably [x_min, y_min, z_min, x_max, y_max, z_max] - confirm.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]

# Mean/std normalization with conversion to RGB (to_rgb=True).
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True,
)

# Shared dimensions reused throughout the model definition.
_dim_ = 256
_pos_dim_ = _dim_ // 2  # 128
_ffn_dim_ = _dim_ * 2  # 512
_num_levels_ = 1  # single-scale features
bev_h_ = 50
bev_w_ = 50
queue_length = 3  # each sequence contains `queue_length` frames.

model = dict(
    type='BEVFormer',
    use_grid_mask=True,
    video_test_mode=True,
    pretrained=dict(img='torchvision://resnet50'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
    ),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True,
    ),
    pts_bbox_head=dict(
        type='BEVFormerHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=3,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=_num_levels_,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10,
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                # Fake cost. This is just to make it compatible with DETR head.
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)

dataset_type = 'CustomNuScenesDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    # Rescale by the fixed 0.5 factor before padding.
    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']),
]

test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='CustomCollect3D', keys=['img']),
        ],
    ),
]

data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR',
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    # `test` reads the same annotation file as `val`.
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)

optimizer = dict(
    type='AdamW',
    lr=2e-4,
    paramwise_cfg=dict(
        custom_keys={
            # lr multiplier of 0.1 for the 'img_backbone' key.
            'img_backbone': dict(lr_mult=0.1),
        },
    ),
    weight_decay=0.01,
)

optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3,
)
total_epochs = 24
evaluation = dict(interval=1, pipeline=test_pipeline)

runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ],
)

checkpoint_config = dict(interval=1)
docker-hub/MapTRv2/MapTR/projects/configs/bevformer_fp16/bevformer_tiny_fp16.py
0 → 100644
View file @
19472568
# BEVFormer-tiny consumes at least 6700M GPU memory
# compared to bevformer_base, bevformer_tiny has
# smaller backbone: R101-DCN -> R50
# smaller BEV: 200*200 -> 50*50
# less encoder layers: 6 -> 3
# smaller input size: 1600*900 -> 800*450
# multi-scale features -> single scale features (C5)
# Base configs this file builds on.
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py',
]
#
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]

# Image normalization statistics, unpacked into the normalize pipeline steps.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# For nuScenes we usually do 10-class detection
class_names = [
    'car',
    'truck',
    'construction_vehicle',
    'bus',
    'trailer',
    'barrier',
    'motorcycle',
    'bicycle',
    'pedestrian',
    'traffic_cone',
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True,
)

# Shared sizes referenced by the model definition.
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1
bev_h_ = 50
bev_w_ = 50
queue_length = 3  # each sequence contains `queue_length` frames.
# Model definition: ResNet-50 image backbone, single-level FPN neck and a
# BEVFormerHead with a 3-layer encoder and a 6-layer decoder.
model = dict(
    type='BEVFormer_fp16',
    use_grid_mask=True,
    video_test_mode=True,
    pretrained=dict(img='torchvision://resnet50'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
    ),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True,
    ),
    pts_bbox_head=dict(
        type='BEVFormerHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=3,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=_num_levels_,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10,
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                # Fake cost (weight 0.0). This is just to make it compatible
                # with DETR head.
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)
dataset_type = 'CustomNuScenesDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# Training pipeline: load multi-view images and 3D annotations, filter
# objects by range and class name, normalize, rescale by 0.5, pad, and collect.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']),
]

# Test pipeline: same image rescale (0.5) and padding as training, wrapped in
# MultiScaleFlipAug3D with flipping disabled; only 'img' is collected.
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='CustomCollect3D', keys=['img']),
        ],
    ),
]
# Dataloader settings and the train / val / test dataset definitions.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=8,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR',
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)
# Optimizer: AdamW with a per-key override that sets lr_mult=0.1 for the
# 'img_backbone' entry of custom_keys.
optimizer = dict(
    type='AdamW',
    lr=2.8e-4,
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),
        },
    ),
    weight_decay=0.01,
)

# Gradient clipping: max_norm=35 using norm_type=2.
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3,
)
# Schedule length; also passed to the runner as max_epochs below.
total_epochs = 24

# Evaluate every epoch using the test pipeline.
evaluation = dict(interval=1, pipeline=test_pipeline)

runner = dict(type='EpochBasedRunner_video', max_epochs=total_epochs)

# Log every 50 iterations through the text and TensorBoard logger hooks.
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ],
)

# Mixed-precision setting with a loss scale of 512.
fp16 = dict(loss_scale=512.)

checkpoint_config = dict(interval=1)

custom_hooks = [dict(type='TransferWeight', priority='LOWEST')]
\ No newline at end of file
docker-hub/MapTRv2/MapTR/projects/configs/datasets/custom_lyft-3d.py
0 → 100644
View file @
19472568
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-80, -80, -5, 80, 80, 3]

# For Lyft we usually do 9-class detection
class_names = [
    'car',
    'truck',
    'bus',
    'emergency_vehicle',
    'other_vehicle',
    'motorcycle',
    'bicycle',
    'pedestrian',
    'animal',
]

dataset_type = 'CustomLyftDataset'
data_root = 'data/lyft/'

# Input modality for Lyft dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=True,
    use_camera=False,
    use_radar=False,
    use_map=False,
    use_external=True,
)

file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/lyft/': 's3://lyft/lyft/',
#         'data/lyft/': 's3://lyft/lyft/'
#     }))
# Training pipeline: load the point cloud plus 10 sweeps, load 3D
# annotations, apply global rotation/scale and random BEV flip, filter by
# range, shuffle points, and collect.
train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
    dict(
        type='GlobalRotScaleTrans',
        rot_range=[-0.3925, 0.3925],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
    ),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]

# Test pipeline: the augmentation steps are kept but configured as identity
# (zero rotation, unit scale, zero translation, flip disabled).
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='GlobalRotScaleTrans',
                rot_range=[0, 0],
                scale_ratio_range=[1., 1.],
                translation_std=[0, 0, 0],
            ),
            dict(type='RandomFlip3D'),
            dict(
                type='PointsRangeFilter',
                point_cloud_range=point_cloud_range,
            ),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='Collect3D', keys=['points']),
        ],
    ),
]

# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(
        type='DefaultFormatBundle3D',
        class_names=class_names,
        with_label=False,
    ),
    dict(type='Collect3D', keys=['points']),
]
# Dataloader settings and the train / val / test dataset definitions.
# val and test both read 'lyft_infos_val.pkl'.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'lyft_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'lyft_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'lyft_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
    ),
)

# For Lyft dataset, we usually evaluate the model at the end of training.
# Since the models are trained for 24 epochs by default, we set the
# evaluation interval to 24. Please change the interval accordingly if you
# do not use the default schedule.
evaluation = dict(interval=24, pipeline=eval_pipeline)
\ No newline at end of file
docker-hub/MapTRv2/MapTR/projects/configs/datasets/custom_nus-3d.py
0 → 100644
View file @
19472568
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-50, -50, -5, 50, 50, 3]

# For nuScenes we usually do 10-class detection
class_names = [
    'car',
    'truck',
    'trailer',
    'bus',
    'construction_vehicle',
    'bicycle',
    'motorcycle',
    'pedestrian',
    'traffic_cone',
    'barrier',
]

dataset_type = 'NuScenesDataset_eval_modified'
data_root = 'data/nuscenes/'

# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=True,
    use_camera=False,
    use_radar=False,
    use_map=False,
    use_external=False,
)

file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
#     }))
# Training pipeline: load the point cloud plus 10 sweeps, load 3D
# annotations, apply global rotation/scale and random BEV flip, filter by
# range and class name, shuffle points, and collect.
train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
    dict(
        type='GlobalRotScaleTrans',
        rot_range=[-0.3925, 0.3925],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
    ),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]

# Test pipeline: the augmentation steps are kept but configured as identity
# (zero rotation, unit scale, zero translation, flip disabled).
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='GlobalRotScaleTrans',
                rot_range=[0, 0],
                scale_ratio_range=[1., 1.],
                translation_std=[0, 0, 0],
            ),
            dict(type='RandomFlip3D'),
            dict(
                type='PointsRangeFilter',
                point_cloud_range=point_cloud_range,
            ),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='Collect3D', keys=['points']),
        ],
    ),
]

# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(
        type='DefaultFormatBundle3D',
        class_names=class_names,
        with_label=False,
    ),
    dict(type='Collect3D', keys=['points']),
]
# Dataloader settings and the train / val / test dataset definitions.
# val and test both read 'nuscenes_infos_val.pkl'.
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR',
    ),
    val=dict(
        type=dataset_type,
        # data_root was missing here although train and test set it; it is
        # passed explicitly so the val split is configured like the others.
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR',
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR',
    ),
)

# For nuScenes dataset, we usually evaluate the model at the end of training.
# Since the models are trained for 24 epochs by default, we set the
# evaluation interval to 24. Please change the interval accordingly if you
# do not use the default schedule.
evaluation = dict(interval=24, pipeline=eval_pipeline)
docker-hub/MapTRv2/MapTR/projects/configs/datasets/custom_waymo-3d.py
0 → 100644
View file @
19472568
# dataset settings
# D5 in the config name means the whole dataset is divided into 5 folds
# We only use one fold for efficient experiments
dataset_type = 'CustomWaymoDataset'
data_root = 'data/waymo/kitti_format/'

file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel', path_mapping=dict(data='s3://waymo_data/'))

# Image normalization: per-channel mean subtraction only (std is 1.0) and
# to_rgb disabled.
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675],
    std=[1.0, 1.0, 1.0],
    to_rgb=False,
)

class_names = ['Car', 'Pedestrian', 'Cyclist']
point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
input_modality = dict(use_lidar=False, use_camera=True)
# Ground-truth database sampler settings, reading
# 'waymo_dbinfos_train.pkl' under data_root.
db_sampler = dict(
    data_root=data_root,
    info_path=data_root + 'waymo_dbinfos_train.pkl',
    rate=1.0,
    prepare=dict(
        filter_by_difficulty=[-1],
        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10),
    ),
    classes=class_names,
    sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
    points_loader=dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=[0, 1, 2, 3, 4],
        file_client_args=file_client_args,
    ),
)
# Training pipeline: load multi-view images and 3D annotations, filter
# objects by range and class name, normalize, pad, and collect.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']),
]

# Test pipeline: normalize and pad first, then format and collect 'img'
# inside MultiScaleFlipAug3D with flipping disabled.
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1920, 1280),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='CustomCollect3D', keys=['img']),
        ],
    ),
]
# No dedicated eval_pipeline is defined in this file; `evaluation` below
# reuses test_pipeline. Keep any loading used for visualization consistent
# with test_pipeline (e.g. file client).
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=4,
    # The training set is wrapped in RepeatDataset with times=2.
    train=dict(
        type='RepeatDataset',
        times=2,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            ann_file=data_root + 'waymo_infos_train.pkl',
            split='training',
            pipeline=train_pipeline,
            modality=input_modality,
            classes=class_names,
            test_mode=False,
            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
            box_type_3d='LiDAR',
            # load one frame every five frames
            load_interval=5,
        ),
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'waymo_infos_val.pkl',
        split='training',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True,
        box_type_3d='LiDAR',
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'waymo_infos_val.pkl',
        split='training',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True,
        box_type_3d='LiDAR',
    ),
)

evaluation = dict(interval=24, pipeline=test_pipeline)
\ No newline at end of file
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_nano_r18_110e.py
0 → 100644
View file @
19472568
# Base configs this file builds on.
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py',
]
#
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
voxel_size = [0.15, 0.15, 4]

# Image normalization statistics, unpacked into the normalize pipeline steps.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# For nuScenes we usually do 10-class detection
class_names = [
    'car',
    'truck',
    'construction_vehicle',
    'bus',
    'trailer',
    'barrier',
    'motorcycle',
    'bicycle',
    'pedestrian',
    'traffic_cone',
]

# map has classes: divider, ped_crossing, boundary
map_classes = ['divider', 'ped_crossing', 'boundary']
# fixed_ptsnum_per_line = 20
# map_classes = ['divider',]
fixed_ptsnum_per_gt_line = 20  # now only support fixed_pts > 0
fixed_ptsnum_per_pred_line = 20
eval_use_same_gt_sample_num_flag = True
num_map_classes = len(map_classes)

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True,
)

# Shared sizes referenced by the model definition.
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1
# bev_h_ = 50
# bev_w_ = 50
bev_h_ = 80
bev_w_ = 40
queue_length = 1  # each sequence contains `queue_length` frames.
# Model definition: ResNet-18 image backbone, single-level FPN neck and a
# MapTRHead with a 1-layer encoder and a 2-layer decoder.
model = dict(
    type='MapTR',
    use_grid_mask=True,
    video_test_mode=False,
    pretrained=dict(img='ckpts/resnet18-f37072fd.pth'),
    img_backbone=dict(
        type='ResNet',
        depth=18,
        num_stages=4,
        out_indices=(3,),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        norm_eval=False,
        style='pytorch',
    ),
    img_neck=dict(
        type='FPN',
        in_channels=[512],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True,
    ),
    pts_bbox_head=dict(
        type='MapTRHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_vec=100,
        num_pts_per_vec=fixed_ptsnum_per_pred_line,  # one bbox
        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
        dir_interval=1,
        query_embed_type='instance_pts',
        transform_method='minmax',
        gt_shift_pts_pattern='v2',
        num_classes=num_map_classes,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        code_size=2,
        code_weights=[1.0, 1.0, 1.0, 1.0],
        transformer=dict(
            type='MapTRPerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=1,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='GeometrySptialCrossAttention',
                            pc_range=point_cloud_range,
                            attention=dict(
                                type='GeometryKernelAttention',
                                embed_dims=_dim_,
                                num_heads=4,
                                dilation=1,
                                kernel_size=(3, 5),
                                num_levels=_num_levels_,
                                im2col_step=192,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='MapTRDecoder',
                num_layers=2,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=4,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                            im2col_step=192,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='MapTRNMSFreeCoder',
            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
            pc_range=point_cloud_range,
            max_num=50,
            voxel_size=voxel_size,
            num_classes=num_map_classes,
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
        loss_pts=dict(type='PtsL1Loss', loss_weight=5.0),
        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='MapTRAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(
                    type='BBoxL1Cost',
                    weight=0.0,
                    box_format='xywh',
                ),
                # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
                iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
                pts_cost=dict(type='OrderedPtsL1Cost', weight=5),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)
dataset_type = 'CustomNuScenesLocalMapDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# Training pipeline: load multi-view images and 3D annotations, filter
# objects by range and class name, normalize, rescale by 0.2, pad, and collect.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.2]),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']),
]

# Test pipeline: same image rescale (0.2) and padding as training, wrapped in
# MultiScaleFlipAug3D with flipping disabled; only 'img' is collected.
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='RandomScaleImageMultiViewImage', scales=[0.2]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='CustomCollect3D', keys=['img']),
        ],
    ),
]
# Dataloader settings and the train / val / test dataset definitions.
# val and test additionally pass map_ann_file; train does not.
data = dict(
    samples_per_gpu=24,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR',
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)
# Optimizer: AdamW with a per-key override that sets lr_mult=0.1 for the
# 'img_backbone' entry of custom_keys.
optimizer = dict(
    type='AdamW',
    lr=4e-3,
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),
        },
    ),
    weight_decay=0.01,
)

# Gradient clipping: max_norm=50 using norm_type=2.
optimizer_config = dict(grad_clip=dict(max_norm=50, norm_type=2))

# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3,
)
# Schedule length; also passed to the runner as max_epochs below.
total_epochs = 110
# total_epochs = 50
# evaluation = dict(interval=1, pipeline=test_pipeline)

# Evaluate every 2 epochs with the 'chamfer' metric.
evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')

runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)

# Log every 50 iterations through the text and TensorBoard logger hooks.
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ],
)

# Mixed-precision setting with a loss scale of 512.
fp16 = dict(loss_scale=512.)

# Save a checkpoint every 5 epochs.
checkpoint_config = dict(interval=5)
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_tiny_fusion_24e.py
0 → 100644
View file @
19472568
# Base configs this file builds on.
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py',
]
#
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
# Separate range for the lidar branch; it differs from point_cloud_range
# only in the z bounds (-5.0 to 3.0).
lidar_point_cloud_range = [-15.0, -30.0, -5.0, 15.0, 30.0, 3.0]
voxel_size = [0.1, 0.1, 0.2]

# Image normalization statistics.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# For nuScenes we usually do 10-class detection
class_names = [
    'car',
    'truck',
    'construction_vehicle',
    'bus',
    'trailer',
    'barrier',
    'motorcycle',
    'bicycle',
    'pedestrian',
    'traffic_cone',
]

# map has classes: divider, ped_crossing, boundary
map_classes = ['divider', 'ped_crossing', 'boundary']
# fixed_ptsnum_per_line = 20
# map_classes = ['divider',]
fixed_ptsnum_per_gt_line = 20  # now only support fixed_pts > 0
fixed_ptsnum_per_pred_line = 20
eval_use_same_gt_sample_num_flag = True
num_map_classes = len(map_classes)

# Both lidar and camera inputs are enabled for the fusion model.
input_modality = dict(
    use_lidar=True,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True,
)

# Shared sizes referenced by the model definition.
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1
# bev_h_ = 50
# bev_w_ = 50
bev_h_ = 200
bev_w_ = 100
queue_length = 1  # each sequence contains `queue_length` frames.
# Model definition for the camera + lidar ('fusion') MapTR variant: a sparse
# lidar encoder, a ResNet-50 image backbone with a single-level FPN neck, and
# a MapTRHead whose transformer fuses both streams through a 'ConvFuser'.
model = dict(
    type='MapTR',
    use_grid_mask=True,
    video_test_mode=False,
    modality='fusion',
    lidar_encoder=dict(
        voxelize=dict(
            max_num_points=10,
            point_cloud_range=lidar_point_cloud_range,
            voxel_size=voxel_size,
            max_voxels=[90000, 120000]),
        backbone=dict(
            type='SparseEncoder',
            in_channels=5,
            sparse_shape=[300, 600, 41],
            output_channels=128,
            order=('conv', 'norm', 'act'),
            encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128),
                              (128, 128)),
            encoder_paddings=([0, 0, 1], [0, 0, 1], [0, 0, [1, 1, 0]],
                              [0, 0]),
            block_type='basicblock'),
    ),
    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch'),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='MapTRHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_vec=50,
        num_pts_per_vec=fixed_ptsnum_per_pred_line,  # one bbox
        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
        dir_interval=1,
        query_embed_type='instance_pts',
        transform_method='minmax',
        gt_shift_pts_pattern='v2',
        num_classes=num_map_classes,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        # NOTE(review): code_weights has four entries while code_size is 2
        # -- confirm this is what the head expects.
        code_size=2,
        code_weights=[1.0, 1.0, 1.0, 1.0],
        transformer=dict(
            type='MapTRPerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            modality='fusion',
            fuser=dict(
                type='ConvFuser',
                in_channels=[_dim_, 256],
                out_channels=_dim_,
            ),
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=1,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='GeometrySptialCrossAttention',
                            pc_range=point_cloud_range,
                            attention=dict(
                                type='GeometryKernelAttention',
                                embed_dims=_dim_,
                                num_heads=4,
                                dilation=1,
                                kernel_size=(3, 5),
                                num_levels=_num_levels_),
                            embed_dims=_dim_,
                        )
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm'))),
            decoder=dict(
                type='MapTRDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        bbox_coder=dict(
            type='MapTRNMSFreeCoder',
            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
            pc_range=point_cloud_range,
            max_num=50,
            voxel_size=voxel_size,
            num_classes=num_map_classes),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        # Box and IoU losses are kept in the config with zero weight.
        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
        loss_pts=dict(type='PtsL1Loss', loss_weight=5.0),
        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)),
    # model training and testing settings
    train_cfg=dict(pts=dict(
        # NOTE(review): grid_size does not follow from point_cloud_range /
        # voxel_size defined in this file -- confirm whether anything reads it.
        grid_size=[512, 512, 1],
        voxel_size=voxel_size,
        point_cloud_range=point_cloud_range,
        out_size_factor=4,
        assigner=dict(
            type='MapTRAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
            pts_cost=dict(type='OrderedPtsL1Cost', weight=5),
            pc_range=point_cloud_range))))
# Dataset class name and on-disk location of the nuScenes data.
dataset_type = 'CustomNuScenesLocalMapDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# Lidar loading parameters shared by the train and test pipelines.
reduce_beams = 32
load_dim = 5
use_dim = 5
# Training pipeline: load multi-view images and (multi-sweep) lidar points,
# filter points/objects by range and name, normalize, rescale and pad the
# images, then collect the ground truth together with 'img' and 'points'.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(
        type='CustomLoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=load_dim,
        use_dim=use_dim,
        reduce_beams=reduce_beams),
    dict(
        type='CustomLoadPointsFromMultiSweeps',
        sweeps_num=9,
        load_dim=load_dim,
        use_dim=use_dim,
        reduce_beams=reduce_beams,
        pad_empty_sweeps=True,
        remove_close=True),
    dict(
        type='CustomPointsRangeFilter',
        point_cloud_range=lidar_point_cloud_range),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'points'])
]
# Test pipeline: load and normalize the images, load and range-filter the
# lidar points, then run the formatting transforms inside a single
# 'MultiScaleFlipAug3D' wrapper with flipping disabled.
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='CustomLoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=load_dim,
        use_dim=use_dim,
        reduce_beams=reduce_beams),
    dict(
        type='CustomLoadPointsFromMultiSweeps',
        sweeps_num=9,
        load_dim=load_dim,
        use_dim=use_dim,
        reduce_beams=reduce_beams,
        pad_empty_sweeps=True,
        remove_close=True),
    dict(
        type='CustomPointsRangeFilter',
        point_cloud_range=lidar_point_cloud_range),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='CustomCollect3D', keys=['img', 'points'])
        ])
]
# Dataloader settings. `train` uses the temporal train infos; `val` and
# `test` both read the temporal val infos plus the map annotation file and
# differ only in the extra `samples_per_gpu=1` on `val`.
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler')
)
# AdamW optimizer; parameters matching the 'img_backbone' key get an LR
# multiplier of 0.1.
optimizer = dict(
    type='AdamW',
    lr=6e-4,
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),
        }),
    weight_decay=0.01)

# Gradient clipping by L2 norm with a maximum of 35.
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
# Cosine-annealing schedule with a 500-iteration linear warmup; the warmup
# and minimum ratios are expressed relative to the optimizer's base LR.
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)

# Length of the training run; reused as `max_epochs` by the runner.
total_epochs = 24
# total_epochs = 50
# evaluation = dict(interval=1, pipeline=test_pipeline)
# Validation settings: interval 2, the test-time pipeline, 'chamfer' metric.
evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')

# Epoch-based training loop bounded by `total_epochs`.
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
# Logging: interval 50, written by both the text and TensorBoard hooks.
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

# Mixed-precision settings with a fixed loss scale.
fp16 = dict(loss_scale=512.)

# Checkpoint saving interval.
checkpoint_config = dict(interval=1)

# Presumably forwarded to the distributed wrapper so parameters unused in a
# forward pass are tolerated -- TODO confirm where this flag is consumed.
find_unused_parameters = True
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_tiny_r50_110e.py
0 → 100644
View file @
19472568
# Base configs this file builds on: the nuScenes dataset defaults and the
# default runtime settings.
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py'
]

# Enable the project plugin and name the directory it is loaded from.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# Six bounds; presumably [x_min, y_min, z_min, x_max, y_max, z_max]
# -- TODO confirm ordering against the consumers.
point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
voxel_size = [0.15, 0.15, 4]
# Per-channel image normalization constants, unpacked into the
# 'NormalizeMultiviewImage' pipeline steps.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

# map has classes: divider, ped_crossing, boundary
map_classes = ['divider', 'ped_crossing', 'boundary']
# fixed_ptsnum_per_line = 20
# map_classes = ['divider',]
fixed_ptsnum_per_gt_line = 20  # now only support fixed_pts > 0
fixed_ptsnum_per_pred_line = 20
eval_use_same_gt_sample_num_flag = True
# Derived so the head and coder always agree with `map_classes`.
num_map_classes = len(map_classes)
# Sensor/input switches; this config is camera-only (lidar disabled).
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True)

# Shared width constants; the derived values keep the positional-encoding
# and feed-forward sizes tied to `_dim_`.
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1
# bev_h_ = 50
# bev_w_ = 50
bev_h_ = 200
bev_w_ = 100
queue_length = 1  # each sequence contains `queue_length` frames.
# Model definition for the camera-only MapTR variant: a ResNet-50 image
# backbone with a single-level FPN neck feeding a MapTRHead built from a
# one-layer BEVFormer encoder and a six-layer MapTR decoder.
model = dict(
    type='MapTR',
    use_grid_mask=True,
    video_test_mode=False,
    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch'),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='MapTRHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_vec=50,
        num_pts_per_vec=fixed_ptsnum_per_pred_line,  # one bbox
        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
        dir_interval=1,
        query_embed_type='instance_pts',
        transform_method='minmax',
        gt_shift_pts_pattern='v2',
        num_classes=num_map_classes,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        # NOTE(review): code_weights has four entries while code_size is 2
        # -- confirm this is what the head expects.
        code_size=2,
        code_weights=[1.0, 1.0, 1.0, 1.0],
        transformer=dict(
            type='MapTRPerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=1,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='GeometrySptialCrossAttention',
                            pc_range=point_cloud_range,
                            attention=dict(
                                type='GeometryKernelAttention',
                                embed_dims=_dim_,
                                num_heads=4,
                                dilation=1,
                                kernel_size=(3, 5),
                                num_levels=_num_levels_),
                            embed_dims=_dim_,
                        )
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm'))),
            decoder=dict(
                type='MapTRDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        bbox_coder=dict(
            type='MapTRNMSFreeCoder',
            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
            pc_range=point_cloud_range,
            max_num=50,
            voxel_size=voxel_size,
            num_classes=num_map_classes),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        # Box and IoU losses are kept in the config with zero weight.
        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
        loss_pts=dict(type='PtsL1Loss', loss_weight=5.0),
        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)),
    # model training and testing settings
    train_cfg=dict(pts=dict(
        # NOTE(review): grid_size does not follow from point_cloud_range /
        # voxel_size defined in this file -- confirm whether anything reads it.
        grid_size=[512, 512, 1],
        voxel_size=voxel_size,
        point_cloud_range=point_cloud_range,
        out_size_factor=4,
        assigner=dict(
            type='MapTRAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
            pts_cost=dict(type='OrderedPtsL1Cost', weight=5),
            pc_range=point_cloud_range))))
# Dataset class name and on-disk location of the nuScenes data.
dataset_type = 'CustomNuScenesLocalMapDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
# Training pipeline: load multi-view images, apply photometric distortion,
# load and filter the 3D annotations, normalize, rescale and pad the images,
# then collect the ground truth together with 'img'.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
]
# Test pipeline: load and normalize the images, then run the formatting
# transforms inside a single 'MultiScaleFlipAug3D' wrapper with flipping
# disabled.
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='CustomCollect3D', keys=['img'])
        ])
]
# Dataloader settings. `train` uses the temporal train infos; `val` and
# `test` both read the temporal val infos plus the map annotation file and
# differ only in the extra `samples_per_gpu=1` on `val`.
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler')
)
# AdamW optimizer; parameters matching the 'img_backbone' key get an LR
# multiplier of 0.1.
optimizer = dict(
    type='AdamW',
    lr=6e-4,
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),
        }),
    weight_decay=0.01)

# Gradient clipping by L2 norm with a maximum of 35.
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
# Cosine-annealing schedule with a 500-iteration linear warmup; the warmup
# and minimum ratios are expressed relative to the optimizer's base LR.
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)

# Length of the training run; reused as `max_epochs` by the runner.
total_epochs = 110
# total_epochs = 50
# evaluation = dict(interval=1, pipeline=test_pipeline)
# Validation settings: interval 2, the test-time pipeline, 'chamfer' metric.
evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')

# Epoch-based training loop bounded by `total_epochs`.
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
# Logging: interval 50, written by both the text and TensorBoard hooks.
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

# Mixed-precision settings with a fixed loss scale.
fp16 = dict(loss_scale=512.)

# Checkpoint saving interval.
checkpoint_config = dict(interval=5)
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_tiny_r50_24e.py
0 → 100644
View file @
19472568
# Base configs this file builds on: the nuScenes dataset defaults and the
# default runtime settings.
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py'
]

# Enable the project plugin and name the directory it is loaded from.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# Six bounds; presumably [x_min, y_min, z_min, x_max, y_max, z_max]
# -- TODO confirm ordering against the consumers.
point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
voxel_size = [0.15, 0.15, 4]
# Per-channel image normalization constants, unpacked into the
# 'NormalizeMultiviewImage' pipeline steps.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

# map has classes: divider, ped_crossing, boundary
map_classes = ['divider', 'ped_crossing', 'boundary']
# fixed_ptsnum_per_line = 20
# map_classes = ['divider',]
fixed_ptsnum_per_gt_line = 20  # now only support fixed_pts > 0
fixed_ptsnum_per_pred_line = 20
eval_use_same_gt_sample_num_flag = True
# Derived so the head and coder always agree with `map_classes`.
num_map_classes = len(map_classes)
# Sensor/input switches; this config is camera-only (lidar disabled).
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True)

# Shared width constants; the derived values keep the positional-encoding
# and feed-forward sizes tied to `_dim_`.
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1
# bev_h_ = 50
# bev_w_ = 50
bev_h_ = 200
bev_w_ = 100
queue_length = 1  # each sequence contains `queue_length` frames.
# Model definition for the camera-only MapTR variant: a ResNet-50 image
# backbone with a single-level FPN neck feeding a MapTRHead built from a
# one-layer BEVFormer encoder and a six-layer MapTR decoder.
model = dict(
    type='MapTR',
    use_grid_mask=True,
    video_test_mode=False,
    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch'),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='MapTRHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_vec=50,
        num_pts_per_vec=fixed_ptsnum_per_pred_line,  # one bbox
        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
        dir_interval=1,
        query_embed_type='instance_pts',
        transform_method='minmax',
        gt_shift_pts_pattern='v2',
        num_classes=num_map_classes,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        # NOTE(review): code_weights has four entries while code_size is 2
        # -- confirm this is what the head expects.
        code_size=2,
        code_weights=[1.0, 1.0, 1.0, 1.0],
        transformer=dict(
            type='MapTRPerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=1,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='GeometrySptialCrossAttention',
                            pc_range=point_cloud_range,
                            attention=dict(
                                type='GeometryKernelAttention',
                                embed_dims=_dim_,
                                num_heads=4,
                                dilation=1,
                                kernel_size=(3, 5),
                                num_levels=_num_levels_),
                            embed_dims=_dim_,
                        )
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm'))),
            decoder=dict(
                type='MapTRDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        bbox_coder=dict(
            type='MapTRNMSFreeCoder',
            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
            pc_range=point_cloud_range,
            max_num=50,
            voxel_size=voxel_size,
            num_classes=num_map_classes),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        # Box and IoU losses are kept in the config with zero weight.
        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
        loss_pts=dict(type='PtsL1Loss', loss_weight=5.0),
        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)),
    # model training and testing settings
    train_cfg=dict(pts=dict(
        # NOTE(review): grid_size does not follow from point_cloud_range /
        # voxel_size defined in this file -- confirm whether anything reads it.
        grid_size=[512, 512, 1],
        voxel_size=voxel_size,
        point_cloud_range=point_cloud_range,
        out_size_factor=4,
        assigner=dict(
            type='MapTRAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
            pts_cost=dict(type='OrderedPtsL1Cost', weight=5),
            pc_range=point_cloud_range))))
# Dataset class name and on-disk location of the nuScenes data.
dataset_type = 'CustomNuScenesLocalMapDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
# Training pipeline: load multi-view images, apply photometric distortion,
# load and filter the 3D annotations, normalize, rescale and pad the images,
# then collect the ground truth together with 'img'.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
]
# Test pipeline: load and normalize the images, then run the formatting
# transforms inside a single 'MultiScaleFlipAug3D' wrapper with flipping
# disabled.
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='CustomCollect3D', keys=['img'])
        ])
]
# Dataloader settings. `train` uses the temporal train infos; `val` and
# `test` both read the temporal val infos plus the map annotation file and
# differ only in the extra `samples_per_gpu=1` on `val`.
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler')
)
# AdamW optimizer; parameters matching the 'img_backbone' key get an LR
# multiplier of 0.1.
optimizer = dict(
    type='AdamW',
    lr=6e-4,
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),
        }),
    weight_decay=0.01)

# Gradient clipping by L2 norm with a maximum of 35.
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
# Cosine-annealing schedule with a 500-iteration linear warmup; the warmup
# and minimum ratios are expressed relative to the optimizer's base LR.
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)

# Length of the training run; reused as `max_epochs` by the runner.
total_epochs = 24
# total_epochs = 50
# evaluation = dict(interval=1, pipeline=test_pipeline)
# Validation settings: interval 2, the test-time pipeline, 'chamfer' metric.
evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')

# Epoch-based training loop bounded by `total_epochs`.
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
# Logging: interval 50, written by both the text and TensorBoard hooks.
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

# Mixed-precision settings with a fixed loss scale.
fp16 = dict(loss_scale=512.)

# Checkpoint saving interval.
checkpoint_config = dict(interval=1)
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer.py
0 → 100644
View file @
19472568
# Base configs this file builds on: the nuScenes dataset defaults and the
# default runtime settings.
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py'
]

# Enable the project plugin and name the directory it is loaded from.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# Six bounds; presumably [x_min, y_min, z_min, x_max, y_max, z_max]
# -- TODO confirm ordering against the consumers.
point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
voxel_size = [0.15, 0.15, 4]
# Per-channel image normalization constants (mean, std) and the RGB
# conversion flag.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

# map has classes: divider, ped_crossing, boundary
map_classes = ['divider', 'ped_crossing', 'boundary']
# fixed_ptsnum_per_line = 20
# map_classes = ['divider',]
fixed_ptsnum_per_gt_line = 20  # now only support fixed_pts > 0
fixed_ptsnum_per_pred_line = 20
eval_use_same_gt_sample_num_flag = True
# Derived from `map_classes` so the count cannot drift from the list.
num_map_classes = len(map_classes)
# Sensor/input switches; this config is camera-only (lidar disabled).
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True)

# Shared width constants; the derived values keep the positional and
# feed-forward sizes tied to `_dim_`.
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1
# bev_h_ = 50
# bev_w_ = 50
bev_h_ = 200
bev_w_ = 100
queue_length = 1  # each sequence contains `queue_length` frames.
# MapTR detector: ResNet-50 image backbone + single-level FPN neck feeding a
# MapTRHead whose transformer uses a BEVFormer encoder and a MapTR decoder.
# Relies on the names defined at the top of this config (_dim_, bev_h_,
# point_cloud_range, voxel_size, num_map_classes, ...).
model = dict(
    type='MapTR',
    use_grid_mask=True,
    video_test_mode=False,
    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        # Only the last of the four stages (index 3) is exported.
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
    ),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True,
    ),
    pts_bbox_head=dict(
        type='MapTRHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_vec=50,
        num_pts_per_vec=fixed_ptsnum_per_pred_line,  # one bbox
        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
        dir_interval=1,
        query_embed_type='instance_pts',
        transform_method='minmax',
        gt_shift_pts_pattern='v2',
        num_classes=num_map_classes,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        # NOTE(review): code_weights has 4 entries while code_size is 2 --
        # confirm how MapTRHead consumes them.
        code_size=2,
        code_weights=[1.0, 1.0, 1.0, 1.0],
        transformer=dict(
            type='MapTRPerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=1,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=_num_levels_,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='MapTRDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='MapTRNMSFreeCoder',
            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            # 8 values, 5 m wider than point_cloud_range in x and y; exact
            # layout is defined by MapTRNMSFreeCoder -- confirm there.
            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
            pc_range=point_cloud_range,
            max_num=50,
            voxel_size=voxel_size,
            num_classes=num_map_classes,
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        # Box and IoU losses are configured with zero weight; point and
        # direction losses carry the regression signal.
        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
        loss_pts=dict(type='PtsL1Loss', loss_weight=5.0),
        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='MapTRAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(
                    type='BBoxL1Cost', weight=0.0, box_format='xywh'),
                # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
                iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
                pts_cost=dict(type='OrderedPtsL1Cost', weight=5),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)
# Dataset class, data location and the train/test transform pipelines.
# Uses point_cloud_range, class_names and img_norm_cfg from the top of this
# config.
dataset_type = 'CustomNuScenesLocalMapDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'],
    ),
]

# Test-time pipeline: no annotation loading or photometric distortion; the
# same 0.5 rescale and pad-to-32 steps run inside MultiScaleFlipAug3D.
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='CustomCollect3D', keys=['img']),
        ],
    ),
]
# Dataloader settings. `val` and `test` both read the validation split and
# differ only in `samples_per_gpu=1`, which is set on `val` alone.
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR',
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)
# AdamW, base lr 6e-4, weight decay 0.01. The 'img_backbone' custom key
# carries lr_mult=0.1 (presumably matched against parameter names by mmcv's
# optimizer constructor -- confirm).
optimizer = dict(
    type='AdamW',
    lr=6e-4,
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),
        },
    ),
    weight_decay=0.01,
)

# Gradient clipping settings (max_norm=35, norm_type=2).
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy
# Cosine annealing with a 500-iteration linear warmup. warmup_ratio (1/3) and
# min_lr_ratio (1e-3) are presumably fractions of the base lr -- confirm
# against the installed mmcv version.
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3,
)
# Length of the schedule; `runner.max_epochs` below reuses this value.
total_epochs = 24
# Alternative schedule kept for reference:
# total_epochs = 50
# evaluation = dict(interval=1, pipeline=test_pipeline)

# Evaluation uses `test_pipeline` (defined earlier in this config) and the
# 'chamfer' metric; interval=2 is presumably counted in epochs -- confirm.
evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')

runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
# Logging: interval=50 with both a text and a TensorBoard logger hook.
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ],
)

# Mixed-precision settings with loss_scale=512 (presumably a static loss
# scale consumed by mmcv's fp16 optimizer hook -- confirm).
fp16 = dict(loss_scale=512.)

# Checkpoint saving with interval=1.
checkpoint_config = dict(interval=1)
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevpool.py
0 → 100644
View file @
19472568
# Shared settings for the MapTR-tiny (ResNet-50, 24 epochs, BEV-pooling / LSS
# encoder) config. Everything below is referenced by the `model`, pipeline
# and `data` sections further down in this file.
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py',
]

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# Assumed order: [x_min, y_min, z_min, x_max, y_max, z_max] -- confirm.
point_cloud_range = [-15.0, -30.0, -10.0, 15.0, 30.0, 10.0]
# The voxel height (20.0) equals the z extent of point_cloud_range (-10 .. 10).
voxel_size = [0.15, 0.15, 20.0]
# Passed to the LSSTransform encoder; presumably depth bins as
# [start, end, step] -- confirm against LSSTransform.
dbound = [1.0, 35.0, 0.5]

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# map has classes: divider, ped_crossing, boundary
map_classes = ['divider', 'ped_crossing', 'boundary']
# fixed_ptsnum_per_line = 20
# map_classes = ['divider',]
fixed_ptsnum_per_gt_line = 20  # now only support fixed_pts > 0
fixed_ptsnum_per_pred_line = 20
eval_use_same_gt_sample_num_flag = True
num_map_classes = len(map_classes)

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True,
)

# Channel widths derived from the base embedding size.
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1

# bev_h_ = 50
# bev_w_ = 50
# 200 x 100 BEV cells over the 60 m x 30 m range: 0.3 m per cell if rows run
# along y and columns along x (assumed -- confirm).
bev_h_ = 200
bev_w_ = 100
queue_length = 1  # each sequence contains `queue_length` frames.
# MapTR detector: ResNet-50 image backbone + single-level FPN neck feeding a
# MapTRHead whose transformer uses an LSSTransform encoder and a MapTR
# decoder. Relies on the names defined at the top of this config (_dim_,
# bev_h_, point_cloud_range, voxel_size, dbound, num_map_classes, ...).
model = dict(
    type='MapTR',
    use_grid_mask=True,
    video_test_mode=False,
    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        # Only the last of the four stages (index 3) is exported.
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
    ),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True,
    ),
    pts_bbox_head=dict(
        type='MapTRHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_vec=50,
        num_pts_per_vec=fixed_ptsnum_per_pred_line,  # one bbox
        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
        dir_interval=1,
        query_embed_type='instance_pts',
        transform_method='minmax',
        gt_shift_pts_pattern='v2',
        num_classes=num_map_classes,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        # NOTE(review): code_weights has 4 entries while code_size is 2 --
        # confirm how MapTRHead consumes them.
        code_size=2,
        code_weights=[1.0, 1.0, 1.0, 1.0],
        transformer=dict(
            type='MapTRPerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='LSSTransform',
                in_channels=_dim_,
                out_channels=_dim_,
                feat_down_sample=32,
                pc_range=point_cloud_range,
                voxel_size=voxel_size,
                dbound=dbound,
                downsample=2,
            ),
            decoder=dict(
                type='MapTRDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='MapTRNMSFreeCoder',
            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            # 8 values, 5 m wider than point_cloud_range in x and y; exact
            # layout is defined by MapTRNMSFreeCoder -- confirm there.
            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
            pc_range=point_cloud_range,
            max_num=50,
            voxel_size=voxel_size,
            num_classes=num_map_classes,
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        # Box and IoU losses are configured with zero weight; point and
        # direction losses carry the regression signal.
        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
        loss_pts=dict(type='PtsL1Loss', loss_weight=5.0),
        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='MapTRAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(
                    type='BBoxL1Cost', weight=0.0, box_format='xywh'),
                # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
                iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
                pts_cost=dict(type='OrderedPtsL1Cost', weight=5),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)
# Dataset class, data location and the train/test transform pipelines.
# Uses point_cloud_range, class_names and img_norm_cfg from the top of this
# config.
dataset_type = 'CustomNuScenesLocalMapDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'],
    ),
]

# Test-time pipeline: no annotation loading or photometric distortion; the
# same 0.5 rescale and pad-to-32 steps run inside MultiScaleFlipAug3D.
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='CustomCollect3D', keys=['img']),
        ],
    ),
]
# Dataloader settings. `val` and `test` both read the validation split and
# differ only in `samples_per_gpu=1`, which is set on `val` alone.
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR',
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)
# AdamW, base lr 6e-4, weight decay 0.01. The 'img_backbone' custom key
# carries lr_mult=0.1 (presumably matched against parameter names by mmcv's
# optimizer constructor -- confirm).
optimizer = dict(
    type='AdamW',
    lr=6e-4,
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),
        },
    ),
    weight_decay=0.01,
)

# Gradient clipping settings (max_norm=35, norm_type=2).
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy
# Cosine annealing with a 500-iteration linear warmup. warmup_ratio (1/3) and
# min_lr_ratio (1e-3) are presumably fractions of the base lr -- confirm
# against the installed mmcv version.
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3,
)
# Length of the schedule; `runner.max_epochs` below reuses this value.
total_epochs = 24
# Alternative schedule kept for reference:
# total_epochs = 50
# evaluation = dict(interval=1, pipeline=test_pipeline)

# Evaluation uses `test_pipeline` (defined earlier in this config) and the
# 'chamfer' metric; interval=2 is presumably counted in epochs -- confirm.
evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')

runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
# Logging: interval=50 with both a text and a TensorBoard logger hook.
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ],
)

# Mixed-precision settings with loss_scale=512 (presumably a static loss
# scale consumed by mmcv's fp16 optimizer hook -- confirm).
fp16 = dict(loss_scale=512.)

# Checkpoint saving with interval=1.
checkpoint_config = dict(interval=1)
docker-hub/MapTRv2/MapTR/projects/configs/maptr/maptr_tiny_r50_av2_24e.py
0 → 100644
View file @
19472568
# Shared settings for the MapTR-tiny (ResNet-50, 24 epochs) Argoverse 2
# config. Everything below is referenced by the `model`, pipeline and `data`
# sections further down in this file.
# NOTE(review): the nuScenes dataset base is inherited although
# `dataset_type` later in this file is the AV2 dataset -- confirm intended.
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py',
]

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# Assumed order: [x_min, y_min, z_min, x_max, y_max, z_max] -- confirm.
point_cloud_range = [-30.0, -15.0, -2.0, 30.0, 15.0, 2.0]
# The voxel height (4) equals the z extent of point_cloud_range (-2 .. 2).
voxel_size = [0.15, 0.15, 4]

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# The nuScenes 10-class detection list, carried over unchanged; it is still
# passed to DefaultFormatBundle3D and to the datasets' `classes` in this file.
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# map has classes: divider, ped_crossing, boundary
map_classes = ['divider', 'ped_crossing', 'boundary']
# fixed_ptsnum_per_line = 20
# map_classes = ['divider',]
fixed_ptsnum_per_gt_line = 20  # now only support fixed_pts > 0
fixed_ptsnum_per_pred_line = 20
eval_use_same_gt_sample_num_flag = True
num_map_classes = len(map_classes)

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True,
)

# Channel widths derived from the base embedding size.
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1

# bev_h_ = 50
# bev_w_ = 50
# 100 x 200 BEV cells over the 30 m x 60 m range: 0.3 m per cell if rows run
# along y and columns along x (assumed -- confirm).
bev_h_ = 100
bev_w_ = 200
queue_length = 1  # each sequence contains `queue_length` frames.
# MapTR detector for the 7-camera setup (num_cams=7): ResNet-50 image
# backbone + single-level FPN neck feeding a MapTRHead whose transformer uses
# a BEVFormer encoder with geometry-kernel cross attention and a MapTR
# decoder. Relies on the names defined at the top of this config (_dim_,
# bev_h_, point_cloud_range, voxel_size, num_map_classes, ...).
model = dict(
    type='MapTR',
    use_grid_mask=True,
    video_test_mode=False,
    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        # Only the last of the four stages (index 3) is exported.
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
    ),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True,
    ),
    pts_bbox_head=dict(
        type='MapTRHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_vec=50,
        num_pts_per_vec=fixed_ptsnum_per_pred_line,  # one bbox
        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
        dir_interval=1,
        query_embed_type='instance_pts',
        transform_method='minmax',
        gt_shift_pts_pattern='v2',
        num_classes=num_map_classes,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        # NOTE(review): code_weights has 4 entries while code_size is 2 --
        # confirm how MapTRHead consumes them.
        code_size=2,
        code_weights=[1.0, 1.0, 1.0, 1.0],
        transformer=dict(
            type='MapTRPerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            num_cams=7,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=1,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            # The spelling 'Sptial' is the registered type
                            # string; do not "correct" it here.
                            type='GeometrySptialCrossAttention',
                            pc_range=point_cloud_range,
                            num_cams=7,
                            attention=dict(
                                type='GeometryKernelAttention',
                                embed_dims=_dim_,
                                num_heads=4,
                                dilation=1,
                                kernel_size=(3, 5),
                                num_levels=_num_levels_,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='MapTRDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='MapTRNMSFreeCoder',
            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            # 8 values, 5 m wider than point_cloud_range in x and y; exact
            # layout is defined by MapTRNMSFreeCoder -- confirm there.
            post_center_range=[-35, -20, -35, -20, 35, 20, 35, 20],
            pc_range=point_cloud_range,
            max_num=50,
            voxel_size=voxel_size,
            num_classes=num_map_classes,
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        # Box and IoU losses are configured with zero weight; point and
        # direction losses carry the regression signal.
        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
        loss_pts=dict(type='PtsL1Loss', loss_weight=5.0),
        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='MapTRAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(
                    type='BBoxL1Cost', weight=0.0, box_format='xywh'),
                # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
                iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
                pts_cost=dict(type='OrderedPtsL1Cost', weight=5),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)
# Dataset class, data location and the train/test transform pipelines for
# Argoverse 2. Uses class_names and img_norm_cfg from the top of this config.
dataset_type = 'CustomAV2LocalMapDataset'
data_root = 'data/argoverse2/sensor/'
file_client_args = dict(backend='disk')

# Images are rescaled by 0.3 right after loading, before the photometric
# distortion and normalisation steps; only 'img' is collected.
train_pipeline = [
    dict(
        type='CustomLoadMultiViewImageFromFiles',
        to_float32=True,
        padding=True,
    ),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.3]),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['img']),
]

test_pipeline = [
    dict(
        type='CustomLoadMultiViewImageFromFiles',
        to_float32=True,
        padding=True,
    ),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.3]),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(2048, 2048),  # 2048*0.3, 2048*0.3
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='CustomCollect3D', keys=['img']),
        ],
    ),
]
# Dataloader settings. `val` and `test` both read the AV2 validation split
# and differ only in `samples_per_gpu=1`, which is set on `val` alone.
data = dict(
    samples_per_gpu=6,
    workers_per_gpu=8,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'av2_map_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR',
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'av2_map_infos_val.pkl',
        # ann_file=data_root + 'av2_map_infos_train.pkl',
        map_ann_file=data_root + 'av2_map_anns_val.json',
        # The original note says AV2 runs at 10 Hz and that an interval of 5
        # gives 2 Hz like nuScenes; the value actually set is 4 (2.5 Hz at
        # 10 Hz). NOTE(review): confirm which interval is intended.
        load_interval=4,
        # load_interval=1, # TODO debug
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'av2_map_infos_val.pkl',
        # ann_file=data_root + 'av2_map_infos_train.pkl',
        map_ann_file=data_root + 'av2_map_anns_val.json',
        # The original note says AV2 runs at 10 Hz and that an interval of 5
        # gives 2 Hz like nuScenes; the value actually set is 4 (2.5 Hz at
        # 10 Hz). NOTE(review): confirm which interval is intended.
        load_interval=4,
        # load_interval=1, # TODO debug
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)
# AdamW, base lr 6e-4, weight decay 0.01. The 'img_backbone' custom key
# carries lr_mult=0.1 (presumably matched against parameter names by mmcv's
# optimizer constructor -- confirm).
optimizer = dict(
    type='AdamW',
    lr=6e-4,
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),
        },
    ),
    weight_decay=0.01,
)

# Gradient clipping settings (max_norm=35, norm_type=2).
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy
# Cosine annealing with a 500-iteration linear warmup. warmup_ratio (1/3) and
# min_lr_ratio (1e-3) are presumably fractions of the base lr -- confirm
# against the installed mmcv version.
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3,
)
# Length of the schedule; `runner.max_epochs` below reuses this value.
total_epochs = 24
# Alternative schedule kept for reference:
# total_epochs = 50
# evaluation = dict(interval=1, pipeline=test_pipeline)

# Evaluation uses `test_pipeline` (defined earlier in this config) and the
# 'chamfer' metric; interval=2 is presumably counted in epochs -- confirm.
evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')

runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
# Logging: interval=50 with both a text and a TensorBoard logger hook.
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ],
)

# Mixed-precision settings with loss_scale=512 (presumably a static loss
# scale consumed by mmcv's fp16 optimizer hook -- confirm).
fp16 = dict(loss_scale=512.)

# Checkpoint saving with interval=1.
checkpoint_config = dict(interval=1)
docker-hub/MapTRv2/MapTR/projects/configs/maptrv2/maptrv2_av2_3d_r50_6ep.py
0 → 100644
View file @
19472568
# Shared settings for the MapTRv2 (ResNet-50, Argoverse 2, 3D map) config.
# Everything below is referenced by the `model` section that follows.
# NOTE(review): the nuScenes dataset base is inherited although the file name
# targets Argoverse 2 -- confirm intended.
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py',
]

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# Assumed order: [x_min, y_min, z_min, x_max, y_max, z_max] -- confirm.
point_cloud_range = [-30.0, -15.0, -5.0, 30.0, 15.0, 3.0]
# The voxel height (8.0) equals the z extent of point_cloud_range (-5 .. 3).
voxel_size = [0.15, 0.15, 8.0]
# Passed to the LSSTransform encoder; presumably depth bins as
# [start, end, step] -- confirm against LSSTransform.
dbound = [1.0, 35.0, 0.5]

# Only the 'depth' entry is marked as used by the original author.
# NOTE(review): 'x' and 'y' have identical lower and upper bounds; harmless
# while they stay unused, but verify before relying on them.
grid_config = {
    'x': [-30.0, -30.0, 0.15],  # useless
    'y': [-15.0, -15.0, 0.15],  # useless
    'z': [-10, 10, 20],  # useless
    'depth': [1.0, 35.0, 0.5],  # useful
}

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# The nuScenes 10-class detection list, carried over unchanged.
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# map has classes: divider, ped_crossing, boundary
map_classes = ['divider', 'ped_crossing', 'boundary']
# fixed_ptsnum_per_line = 20
# map_classes = ['divider',]
num_vec = 50
fixed_ptsnum_per_gt_line = 20  # now only support fixed_pts > 0
fixed_ptsnum_per_pred_line = 20
eval_use_same_gt_sample_num_flag = True
num_map_classes = len(map_classes)

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True,
)

# Channel widths derived from the base embedding size.
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1

# bev_h_ = 50
# bev_w_ = 50
# 100 x 200 BEV cells over the 30 m x 60 m range: 0.3 m per cell if rows run
# along y and columns along x (assumed -- confirm).
bev_h_ = 100
bev_w_ = 200
queue_length = 1  # each sequence contains `queue_length` frames.

# Auxiliary segmentation settings handed to the head as `aux_seg`.
aux_seg_cfg = dict(
    use_aux_seg=True,
    bev_seg=True,
    pv_seg=True,
    seg_classes=1,
    feat_down_sample=32,
    pv_thickness=1,
)

# Height (z) flags handed to both the head and its transformer as `z_cfg`.
z_cfg = dict(
    pred_z_flag=True,
    gt_z_flag=True,
)
# Model definition: ResNet-50 image backbone -> single-level FPN neck ->
# MapTRv2 head (LSSTransform encoder + 6-layer MapTR decoder). All sizes and
# ranges come from the constants defined earlier in this config.
model = dict(
    type='MapTRv2',
    use_grid_mask=True,
    video_test_mode=False,
    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch'),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='MapTRv2Head',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_vec_one2one=num_vec,
        num_vec_one2many=300,
        k_one2many=6,
        num_pts_per_vec=fixed_ptsnum_per_pred_line,  # one bbox
        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
        dir_interval=1,
        query_embed_type='instance_pts',
        transform_method='minmax',
        gt_shift_pts_pattern='v2',
        num_classes=num_map_classes,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        code_size=3,
        code_weights=[1.0, 1.0, 1.0, 1.0],
        aux_seg=aux_seg_cfg,
        z_cfg=z_cfg,
        transformer=dict(
            type='MapTRPerceptionTransformer',
            num_cams=7,
            z_cfg=z_cfg,
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='LSSTransform',
                in_channels=_dim_,
                out_channels=_dim_,
                feat_down_sample=32,
                pc_range=point_cloud_range,
                voxel_size=voxel_size,
                dbound=dbound,
                downsample=2,
                loss_depth_weight=3.0,
                depthnet_cfg=dict(
                    use_dcn=False,
                    with_cp=False,
                    aspp_mid_channels=96),
                grid_config=grid_config,
            ),
            decoder=dict(
                type='MapTRDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DecoupledDetrTransformerDecoderLayer',
                    num_vec=num_vec,
                    num_pts_per_vec=fixed_ptsnum_per_pred_line,
                    # One entry per attention op in `operation_order`:
                    # two self-attentions, then one cross-attention.
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'self_attn', 'norm',
                                     'cross_attn', 'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='MapTRNMSFreeCoder',
            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            z_cfg=z_cfg,
            post_center_range=[-35, -20, -35, -20, 35, 20, 35, 20],
            pc_range=point_cloud_range,
            max_num=50,
            voxel_size=voxel_size,
            num_classes=num_map_classes),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        # The box and IoU losses carry zero weight in this config.
        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
        loss_pts=dict(type='PtsL1Loss', loss_weight=5.0),
        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005),
        loss_seg=dict(type='SimpleLoss', pos_weight=4.0, loss_weight=1.0),
        loss_pv_seg=dict(type='SimpleLoss', pos_weight=1.0, loss_weight=2.0),
    ),
    # model training and testing settings
    train_cfg=dict(pts=dict(
        grid_size=[512, 512, 1],
        voxel_size=voxel_size,
        point_cloud_range=point_cloud_range,
        out_size_factor=4,
        assigner=dict(
            type='MapTRAssigner',
            z_cfg=z_cfg,
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
            pts_cost=dict(type='OrderedPtsL1Cost', weight=5),
            pc_range=point_cloud_range))))
# Dataset identity and the image-only train/test pipelines. Both pipelines
# rescale the multi-view images by 0.3, normalise with `img_norm_cfg`, pad to
# a multiple of 32 and collect only the 'img' key.
dataset_type = 'CustomAV2OfflineLocalMapDataset'
data_root = 'data/argoverse2/sensor/'
file_client_args = dict(backend='disk')

train_pipeline = [
    dict(
        type='CustomLoadMultiViewImageFromFiles',
        to_float32=True,
        padding=True),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.3]),
    # Photometric distortion is the only step the test pipeline lacks.
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='DefaultFormatBundle3D',
        with_gt=False,
        with_label=False,
        class_names=map_classes),
    dict(type='CustomCollect3D', keys=['img']),
]

test_pipeline = [
    dict(
        type='CustomLoadMultiViewImageFromFiles',
        to_float32=True,
        padding=True),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.3]),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(2048, 2048),  # 2048*0.3, 2048*0.3
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='DefaultFormatBundle3D',
                with_gt=False,
                with_label=False,
                class_names=map_classes),
            dict(type='CustomCollect3D', keys=['img']),
        ]),
]
# Dataloader settings. `val` and `test` read the same annotation files and
# differ only in `val` carrying an explicit `samples_per_gpu=1`.
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'av2_map_infos_train.pkl',
        z_cfg=z_cfg,
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        aux_seg=aux_seg_cfg,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'av2_map_infos_val.pkl',
        map_ann_file=data_root + 'av2_gt_map_anns_val.json',
        # code_size=3,
        z_cfg=z_cfg,
        # Original note: AV2 uses 10 Hz; an interval of 5 gives 2 Hz, the
        # same as nuScenes. NOTE(review): the value set here is 4, not 5 --
        # confirm which one is intended.
        load_interval=4,
        # load_interval=1, # TODO debug
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'av2_map_infos_val.pkl',
        map_ann_file=data_root + 'av2_gt_map_anns_val.json',
        # code_size=3,
        z_cfg=z_cfg,
        # Original note: AV2 uses 10 Hz; an interval of 5 gives 2 Hz, the
        # same as nuScenes. NOTE(review): the value set here is 4, not 5 --
        # confirm which one is intended.
        load_interval=4,
        # load_interval=1, # TODO debug
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)
# Optimiser: AdamW with the image backbone's learning rate scaled by 0.1.
optimizer = dict(
    type='AdamW',
    lr=6e-4,
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),
        }),
    weight_decay=0.01)

optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)
# Schedule and runtime settings: 6 epochs, with evaluation and checkpointing
# intervals both set to 1.
total_epochs = 6
evaluation = dict(interval=1, pipeline=test_pipeline, metric='chamfer')
# total_epochs = 50
# evaluation = dict(interval=1, pipeline=test_pipeline)

# The runner's epoch budget is taken from `total_epochs` above.
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ])
fp16 = dict(loss_scale=512.)
checkpoint_config = dict(interval=1)
find_unused_parameters = True
\ No newline at end of file
docker-hub/MapTRv2/MapTR/projects/configs/maptrv2/maptrv2_av2_3d_r50_6ep_w_centerline.py
0 → 100644
View file @
19472568
# Shared constants for the MapTRv2 / Argoverse 2 (3D, ResNet-50, 6 epoch)
# config with an extra 'centerline' map class. Everything here is plain data
# that later sections of the config (model, pipelines, data) refer to by name.
#
# NOTE(review): the dataset base is the nuScenes one although `dataset_type`
# further down is an AV2 dataset; presumably the data settings are overridden
# in this file -- confirm.
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py',
]

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-30.0, -15.0, -5.0, 30.0, 15.0, 3.0]
voxel_size = [0.15, 0.15, 8.0]
dbound = [1.0, 35.0, 0.5]

# The 'useless' / 'useful' tags are the original author's. Note that
# 'depth' carries the same three numbers as `dbound` above.
grid_config = {
    'x': [-30.0, -30.0, 0.15],  # useless
    'y': [-15.0, -15.0, 0.15],  # useless
    'z': [-10, 10, 20],  # useless
    'depth': [1.0, 35.0, 0.5],  # useful
}

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True)

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# map has classes: divider, ped_crossing, boundary, centerline
map_classes = ['divider', 'ped_crossing', 'boundary', 'centerline']
# fixed_ptsnum_per_line = 20
# map_classes = ['divider',]
num_vec = 70
fixed_ptsnum_per_gt_line = 20  # now only support fixed_pts > 0
fixed_ptsnum_per_pred_line = 20
eval_use_same_gt_sample_num_flag = True
num_map_classes = len(map_classes)

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True)

# Channel widths are all derived from `_dim_`.
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1

# bev_h_ = 50
# bev_w_ = 50
bev_h_ = 100
bev_w_ = 200
queue_length = 1  # each sequence contains `queue_length` frames.

aux_seg_cfg = dict(
    use_aux_seg=True,
    bev_seg=True,
    pv_seg=True,
    seg_classes=1,
    feat_down_sample=32,
    pv_thickness=1,
)

# Passed unchanged to the head, transformer, bbox coder, assigner and
# datasets of this config.
z_cfg = dict(
    pred_z_flag=True,
    gt_z_flag=True,
)
# Model definition: ResNet-50 image backbone -> single-level FPN neck ->
# MapTRv2 head (LSSTransform encoder + 6-layer MapTR decoder). All sizes and
# ranges come from the constants defined earlier in this config.
model = dict(
    type='MapTRv2',
    use_grid_mask=True,
    video_test_mode=False,
    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch'),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='MapTRv2Head',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_vec_one2one=num_vec,
        num_vec_one2many=300,
        k_one2many=6,
        num_pts_per_vec=fixed_ptsnum_per_pred_line,  # one bbox
        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
        dir_interval=1,
        query_embed_type='instance_pts',
        transform_method='minmax',
        gt_shift_pts_pattern='v2',
        num_classes=num_map_classes,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        code_size=3,
        code_weights=[1.0, 1.0, 1.0, 1.0],
        aux_seg=aux_seg_cfg,
        z_cfg=z_cfg,
        transformer=dict(
            type='MapTRPerceptionTransformer',
            num_cams=7,
            z_cfg=z_cfg,
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='LSSTransform',
                in_channels=_dim_,
                out_channels=_dim_,
                feat_down_sample=32,
                pc_range=point_cloud_range,
                voxel_size=voxel_size,
                dbound=dbound,
                downsample=2,
                loss_depth_weight=3.0,
                depthnet_cfg=dict(
                    use_dcn=False,
                    with_cp=False,
                    aspp_mid_channels=96),
                grid_config=grid_config,
            ),
            decoder=dict(
                type='MapTRDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DecoupledDetrTransformerDecoderLayer',
                    num_vec=num_vec,
                    num_pts_per_vec=fixed_ptsnum_per_pred_line,
                    # One entry per attention op in `operation_order`:
                    # two self-attentions, then one cross-attention.
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'self_attn', 'norm',
                                     'cross_attn', 'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='MapTRNMSFreeCoder',
            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            z_cfg=z_cfg,
            post_center_range=[-35, -20, -35, -20, 35, 20, 35, 20],
            pc_range=point_cloud_range,
            # NOTE(review): kept at 50 although `num_vec` is 70 in this
            # config -- confirm whether the two are meant to differ.
            max_num=50,
            voxel_size=voxel_size,
            num_classes=num_map_classes),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        # The box and IoU losses carry zero weight in this config.
        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
        loss_pts=dict(type='PtsL1Loss', loss_weight=5.0),
        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005),
        loss_seg=dict(type='SimpleLoss', pos_weight=4.0, loss_weight=1.0),
        loss_pv_seg=dict(type='SimpleLoss', pos_weight=1.0, loss_weight=2.0),
    ),
    # model training and testing settings
    train_cfg=dict(pts=dict(
        grid_size=[512, 512, 1],
        voxel_size=voxel_size,
        point_cloud_range=point_cloud_range,
        out_size_factor=4,
        assigner=dict(
            type='MapTRAssigner',
            z_cfg=z_cfg,
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
            pts_cost=dict(type='OrderedPtsL1Cost', weight=5),
            pc_range=point_cloud_range))))
# Dataset identity and the image-only train/test pipelines. Both pipelines
# rescale the multi-view images by 0.3, normalise with `img_norm_cfg`, pad to
# a multiple of 32 and collect only the 'img' key.
dataset_type = 'CustomAV2OfflineLocalMapDataset'
data_root = 'data/argoverse2/sensor/'
file_client_args = dict(backend='disk')

train_pipeline = [
    dict(
        type='CustomLoadMultiViewImageFromFiles',
        to_float32=True,
        padding=True),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.3]),
    # Photometric distortion is the only step the test pipeline lacks.
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='DefaultFormatBundle3D',
        with_gt=False,
        with_label=False,
        class_names=map_classes),
    dict(type='CustomCollect3D', keys=['img']),
]

test_pipeline = [
    dict(
        type='CustomLoadMultiViewImageFromFiles',
        to_float32=True,
        padding=True),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.3]),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(2048, 2048),  # 2048*0.3, 2048*0.3
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='DefaultFormatBundle3D',
                with_gt=False,
                with_label=False,
                class_names=map_classes),
            dict(type='CustomCollect3D', keys=['img']),
        ]),
]
# Dataloader settings. `val` and `test` read the same annotation files and
# differ only in `val` carrying an explicit `samples_per_gpu=1`.
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'av2_map_infos_train.pkl',
        z_cfg=z_cfg,
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        aux_seg=aux_seg_cfg,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'av2_map_infos_val.pkl',
        map_ann_file=data_root + 'av2_gt_map_anns_val.json',
        # code_size=3,
        z_cfg=z_cfg,
        # Original note: AV2 uses 10 Hz; an interval of 5 gives 2 Hz, the
        # same as nuScenes. NOTE(review): the value set here is 4, not 5 --
        # confirm which one is intended.
        load_interval=4,
        # load_interval=1, # TODO debug
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'av2_map_infos_val.pkl',
        map_ann_file=data_root + 'av2_gt_map_anns_val.json',
        # code_size=3,
        z_cfg=z_cfg,
        # Original note: AV2 uses 10 Hz; an interval of 5 gives 2 Hz, the
        # same as nuScenes. NOTE(review): the value set here is 4, not 5 --
        # confirm which one is intended.
        load_interval=4,
        # load_interval=1, # TODO debug
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)
# Optimiser: AdamW with the image backbone's learning rate scaled by 0.1.
optimizer = dict(
    type='AdamW',
    lr=6e-4,
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),
        }),
    weight_decay=0.01)

optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)
# Schedule and runtime settings: 6 epochs, with evaluation and checkpointing
# intervals both set to 1.
total_epochs = 6
evaluation = dict(interval=1, pipeline=test_pipeline, metric='chamfer')
# total_epochs = 50
# evaluation = dict(interval=1, pipeline=test_pipeline)

# The runner's epoch budget is taken from `total_epochs` above.
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ])
fp16 = dict(loss_scale=512.)
checkpoint_config = dict(interval=1)
find_unused_parameters = True
\ No newline at end of file
docker-hub/MapTRv2/MapTR/projects/configs/maptrv2/maptrv2_nusc_r50_24ep.py
0 → 100644
View file @
19472568
# Shared constants for the MapTRv2 / nuScenes (ResNet-50) config. Everything
# here is plain data that later sections of the config (model, pipelines,
# data) refer to by name.
_base_ = [
    '../datasets/custom_nus-3d.py',
    '../_base_/default_runtime.py',
]

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-15.0, -30.0, -10.0, 15.0, 30.0, 10.0]
voxel_size = [0.15, 0.15, 20.0]
dbound = [1.0, 35.0, 0.5]

# The 'useless' / 'useful' tags are the original author's. Note that
# 'depth' carries the same three numbers as `dbound` above.
grid_config = {
    'x': [-30.0, -30.0, 0.15],  # useless
    'y': [-15.0, -15.0, 0.15],  # useless
    'z': [-10, 10, 20],  # useless
    'depth': [1.0, 35.0, 0.5],  # useful
}

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True)

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# map has classes: divider, ped_crossing, boundary
map_classes = ['divider', 'ped_crossing', 'boundary']
# fixed_ptsnum_per_line = 20
# map_classes = ['divider',]
num_vec = 50
fixed_ptsnum_per_gt_line = 20  # now only support fixed_pts > 0
fixed_ptsnum_per_pred_line = 20
eval_use_same_gt_sample_num_flag = True
num_map_classes = len(map_classes)

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True)

# Channel widths are all derived from `_dim_`.
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1

# bev_h_ = 50
# bev_w_ = 50
bev_h_ = 200
bev_w_ = 100
queue_length = 1  # each sequence contains `queue_length` frames.

aux_seg_cfg = dict(
    use_aux_seg=True,
    bev_seg=True,
    pv_seg=True,
    seg_classes=1,
    feat_down_sample=32,
    pv_thickness=1,
)
# Model definition: ResNet-50 image backbone -> single-level FPN neck ->
# MapTRv2 head (LSSTransform encoder + 6-layer MapTR decoder). All sizes and
# ranges come from the constants defined earlier in this config.
model = dict(
    type='MapTRv2',
    use_grid_mask=True,
    video_test_mode=False,
    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch'),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_levels_,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='MapTRv2Head',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        # Tied to `num_vec` (50) instead of a separate literal so the head
        # and the decoder layer below cannot drift apart.
        num_vec_one2one=num_vec,
        num_vec_one2many=300,
        k_one2many=6,
        num_pts_per_vec=fixed_ptsnum_per_pred_line,  # one bbox
        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
        dir_interval=1,
        query_embed_type='instance_pts',
        transform_method='minmax',
        gt_shift_pts_pattern='v2',
        num_classes=num_map_classes,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        code_size=2,
        code_weights=[1.0, 1.0, 1.0, 1.0],
        aux_seg=aux_seg_cfg,
        # z_cfg=z_cfg,
        transformer=dict(
            type='MapTRPerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='LSSTransform',
                in_channels=_dim_,
                out_channels=_dim_,
                feat_down_sample=32,
                pc_range=point_cloud_range,
                voxel_size=voxel_size,
                dbound=dbound,
                downsample=2,
                loss_depth_weight=3.0,
                depthnet_cfg=dict(
                    use_dcn=False,
                    with_cp=False,
                    aspp_mid_channels=96),
                grid_config=grid_config,
            ),
            decoder=dict(
                type='MapTRDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DecoupledDetrTransformerDecoderLayer',
                    num_vec=num_vec,
                    num_pts_per_vec=fixed_ptsnum_per_pred_line,
                    # One entry per attention op in `operation_order`:
                    # two self-attentions, then one cross-attention.
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'self_attn', 'norm',
                                     'cross_attn', 'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='MapTRNMSFreeCoder',
            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
            pc_range=point_cloud_range,
            max_num=50,
            voxel_size=voxel_size,
            num_classes=num_map_classes),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        # The box and IoU losses carry zero weight in this config.
        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
        loss_pts=dict(type='PtsL1Loss', loss_weight=5.0),
        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005),
        loss_seg=dict(type='SimpleLoss', pos_weight=4.0, loss_weight=1.0),
        loss_pv_seg=dict(type='SimpleLoss', pos_weight=1.0, loss_weight=2.0),
    ),
    # model training and testing settings
    train_cfg=dict(pts=dict(
        grid_size=[512, 512, 1],
        voxel_size=voxel_size,
        point_cloud_range=point_cloud_range,
        out_size_factor=4,
        assigner=dict(
            type='MapTRAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
            pts_cost=dict(type='OrderedPtsL1Cost', weight=5),
            pc_range=point_cloud_range))))
# Dataset identity and the train/test pipelines. Both pipelines rescale the
# multi-view images by 0.5 and normalise with `img_norm_cfg`. The training
# pipeline additionally loads points and collects a 'gt_depth' key next to
# 'img'.
dataset_type = 'CustomNuScenesOfflineLocalMapDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='CustomPointToMultiViewDepth',
        downsample=1,
        grid_config=grid_config),
    dict(type='PadMultiViewImageDepth', size_divisor=32),
    dict(
        type='DefaultFormatBundle3D',
        with_gt=False,
        with_label=False,
        class_names=map_classes),
    dict(type='CustomCollect3D', keys=['img', 'gt_depth']),
]

test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='DefaultFormatBundle3D',
                with_gt=False,
                with_label=False,
                class_names=map_classes),
            dict(type='CustomCollect3D', keys=['img']),
        ]),
]
# Dataloader settings. `val` and `test` read the same annotation files and
# differ only in `val` carrying an explicit `samples_per_gpu=1`.
data = dict(
    samples_per_gpu=8,
    workers_per_gpu=32,
    # The original note here reads "optimal" (translated). NOTE(review): it
    # is unclear whether it refers to the values above or to the commented
    # alternative below -- confirm.
    # samples_per_gpu=12,
    # workers_per_gpu=48, # TODO
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_map_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        aux_seg=aux_seg_cfg,
        test_mode=False,
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        queue_length=queue_length,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_map_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_map_infos_temporal_val.pkl',
        map_ann_file=data_root + 'nuscenes_map_anns_val.json',
        pipeline=test_pipeline,
        bev_size=(bev_h_, bev_w_),
        pc_range=point_cloud_range,
        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
        padding_value=-10000,
        map_classes=map_classes,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)
# Optimiser: AdamW with the image backbone's learning rate scaled by 0.1.
optimizer = dict(
    type='AdamW',
    # type='Miopen_AdamW',
    lr=6e-4,
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),
        }),
    weight_decay=0.01)

optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)
# Schedule and runtime settings.
#
# NOTE(review): `total_epochs` is 1 (the trailing note records 24) while the
# evaluation and checkpoint intervals below are both 2. Presumably this is a
# shortened benchmarking run; confirm whether periodic evaluation and
# checkpointing are expected to trigger at all with these values.
total_epochs = 1  # 24
evaluation = dict(
    interval=2,
    pipeline=test_pipeline,
    metric='chamfer',
    save_best='NuscMap_chamfer/mAP',
    rule='greater')
# total_epochs = 50
# evaluation = dict(interval=1, pipeline=test_pipeline)

# The runner's epoch budget is taken from `total_epochs` above.
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)

log_config = dict(
    interval=1,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ])
fp16 = dict(loss_scale=512.)
checkpoint_config = dict(max_keep_ckpts=1, interval=2)
find_unused_parameters = True
Prev
1
2
3
4
5
6
7
8
…
12
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment