Commit 007f2e68 authored by 雍大凯
Browse files

将子模块转换为普通目录

parent 19472568
# mAP: 0.3805
# mATE: 0.7198
# mASE: 0.2805
# mAOE: 0.4131
# mAVE: 0.7652
# mAAE: 0.1951
# NDS: 0.4529
# Base config list this file builds on (the shared default runtime file).
_base_ = ['../_base_/default_runtime.py']

# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
# When this range is edited, the model settings that take a point cloud
# range must be edited to match.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# nuScenes detection is normally run with these 10 classes.
class_names = [
    'barrier',
    'bicycle',
    'bus',
    'car',
    'construction_vehicle',
    'motorcycle',
    'pedestrian',
    'traffic_cone',
    'trailer',
    'truck',
]

dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'

# Input modality for nuScenes; kept consistent with the submission format,
# which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)

img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675],
    std=[1, 1, 1],
    to_rgb=False,
)

bev_h_ = 200
bev_w_ = 200
frames = (0,)
group_detr = 11

# 102.4 equals the x/y extent of point_cloud_range (51.2 - (-51.2)).
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]

# Image augmentation settings handed to CropResizeFlipImage.
# NOTE(review): 'reisze' looks like a misspelling of 'resize'. The key is
# kept verbatim because the consuming transform presumably looks it up by
# this exact name -- confirm before renaming.
ida_aug_conf = dict(
    # 512..768 is 0.8x..1.2x of the 640 used at evaluation time.
    reisze=[512, 544, 576, 608, 640, 672, 704, 736, 768],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=True,
)

# Evaluation counterpart: one fixed size and no random flip.
ida_aug_conf_eval = dict(
    reisze=[640],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=False,
)
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline, listed in execution order: load images and 3D
# annotations, augment, filter objects, normalise/pad, format, collect the
# listed keys, then run DD3DMapper.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d',
            'gt_labels_3d',
            'img',
            'ego2global_translation',
            'ego2global_rotation',
            'lidar2ego_translation',
            'lidar2ego_rotation',
            'timestamp',
            'mono_input_dict',
            'mono_ann_idx',
            'aug_param',
        ],
    ),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),
    ),
]
# Evaluation pipeline: deterministic crop/resize (ida_aug_conf_eval), then
# normalise, pad and wrap the formatting/collection steps in
# MultiScaleFlipAug3D with flipping disabled.
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img',
                    'ego2global_translation',
                    'ego2global_rotation',
                    'lidar2ego_translation',
                    'lidar2ego_rotation',
                    'timestamp',
                ],
            ),
        ],
    ),
]
# Dataloader and dataset settings.
# Every split reads the shared `dataset_type` / `data_root` constants
# declared earlier in this file instead of re-typing the literals, so the
# dataset class and root directory cannot drift apart between splits.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2,
        ),
    ),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    # NOTE(review): the test split points at the *val* annotation file --
    # presumably intentional for local evaluation; confirm.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)

evaluation = dict(interval=4, pipeline=eval_pipeline)
# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

_dim_ = 256
# Derived from _dim_ rather than repeated as literals (128 and 512).
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 4
_num_mono_levels_ = 5

# The class count and the encoder's feature-level count below are taken from
# `class_names` and `_num_levels_` so they cannot disagree with the
# declarations they depend on.
model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        # Three stages are exported, one per entry of img_neck.in_channels.
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe',
    ),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True,
    ),
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        # The same group count is also given to GroupMultiheadAttention.
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=len(class_names),
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=_num_levels_,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=len(class_names),
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        # Weight 0.0: presumably disables the IoU term -- confirm.
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
    ),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=len(class_names),
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0,
        ),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25,
            focal_loss_gamma=2.0,
            loc_loss_type='giou',
        ),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            # Five entries each, the same count as `strides`.
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275],
        ),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            # Ten rows; presumably one per entry of class_names, in the same
            # order -- confirm against the head implementation.
            canon_box_sizes=[
                [2.3524184, 0.5062202, 1.0413622],
                [0.61416006, 1.7016163, 1.3054738],
                [2.9139307, 10.725025, 3.2832346],
                [1.9751819, 4.641267, 1.74352],
                [2.772134, 6.565072, 3.2474296],
                [0.7800532, 2.138673, 1.4437162],
                [0.6667362, 0.7181772, 1.7616143],
                [0.40246472, 0.4027083, 1.0084083],
                [3.0059454, 12.8197, 4.1213827],
                [2.4986045, 6.9310856, 2.8382742],
            ],
        ),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=(
                (-1, 64),
                (64, 128),
                (128, 256),
                (256, 512),
                (512, 100000000.0),
            ),
        ),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2),
    ),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            # Cost weights equal the loss weights above (2.0 / 0.75 / 0.0).
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)
# ---------------------------------------------------------------------------
# Optimizer
# ---------------------------------------------------------------------------
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        # img_backbone parameters get a learning-rate multiplier of 0.5.
        custom_keys=dict(img_backbone=dict(lr_mult=0.5)),
    ),
    weight_decay=0.01,
)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# ---------------------------------------------------------------------------
# Learning-rate policy and runner
# ---------------------------------------------------------------------------
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[20],
)
total_epochs = 24
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
# mAP: 0.3953
# mATE: 0.6941
# mASE: 0.2765
# mAOE: 0.4199
# mAVE: 0.7537
# mAAE: 0.1866
# NDS: 0.4646
# Base config list this file builds on (the shared default runtime file).
_base_ = ['../_base_/default_runtime.py']

# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
# When this range is edited, the model settings that take a point cloud
# range must be edited to match.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# nuScenes detection is normally run with these 10 classes.
class_names = [
    'barrier',
    'bicycle',
    'bus',
    'car',
    'construction_vehicle',
    'motorcycle',
    'pedestrian',
    'traffic_cone',
    'trailer',
    'truck',
]

dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'

# Input modality for nuScenes; kept consistent with the submission format,
# which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)

img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675],
    std=[1, 1, 1],
    to_rgb=False,
)

bev_h_ = 200
bev_w_ = 200
frames = (0,)
group_detr = 11

# 102.4 equals the x/y extent of point_cloud_range (51.2 - (-51.2)).
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]

# Image augmentation settings handed to CropResizeFlipImage.
# NOTE(review): 'reisze' looks like a misspelling of 'resize'. The key is
# kept verbatim because the consuming transform presumably looks it up by
# this exact name -- confirm before renaming.
ida_aug_conf = dict(
    # 512..768 is 0.8x..1.2x of the 640 used at evaluation time.
    reisze=[512, 544, 576, 608, 640, 672, 704, 736, 768],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=True,
)

# Evaluation counterpart: one fixed size and no random flip.
ida_aug_conf_eval = dict(
    reisze=[640],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=False,
)
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline, listed in execution order: load images and 3D
# annotations, augment, filter objects, normalise/pad, format, collect the
# listed keys, then run DD3DMapper.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d',
            'gt_labels_3d',
            'img',
            'ego2global_translation',
            'ego2global_rotation',
            'lidar2ego_translation',
            'lidar2ego_rotation',
            'timestamp',
            'mono_input_dict',
            'mono_ann_idx',
            'aug_param',
        ],
    ),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),
    ),
]
# Evaluation pipeline: deterministic crop/resize (ida_aug_conf_eval), then
# normalise, pad and wrap the formatting/collection steps in
# MultiScaleFlipAug3D with flipping disabled.
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img',
                    'ego2global_translation',
                    'ego2global_rotation',
                    'lidar2ego_translation',
                    'lidar2ego_rotation',
                    'timestamp',
                ],
            ),
        ],
    ),
]
# Dataloader and dataset settings.
# Every split reads the shared `dataset_type` / `data_root` constants
# declared earlier in this file instead of re-typing the literals, so the
# dataset class and root directory cannot drift apart between splits.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2,
        ),
    ),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    # NOTE(review): the test split points at the *val* annotation file --
    # presumably intentional for local evaluation; confirm.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)

evaluation = dict(interval=4, pipeline=eval_pipeline)
# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

_dim_ = 256
# Derived from _dim_ rather than repeated as literals (128 and 512).
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 4
_num_mono_levels_ = 5

# The class count and the encoder's feature-level count below are taken from
# `class_names` and `_num_levels_` so they cannot disagree with the
# declarations they depend on.
model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        # Three stages are exported, one per entry of img_neck.in_channels.
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe',
    ),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True,
    ),
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        # The same group count is also given to GroupMultiheadAttention.
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=len(class_names),
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=_num_levels_,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=len(class_names),
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        # Weight 0.0: presumably disables the IoU term -- confirm.
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
    ),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=len(class_names),
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0,
        ),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25,
            focal_loss_gamma=2.0,
            loc_loss_type='giou',
        ),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            # Five entries each, the same count as `strides`.
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275],
        ),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            # Ten rows; presumably one per entry of class_names, in the same
            # order -- confirm against the head implementation.
            canon_box_sizes=[
                [2.3524184, 0.5062202, 1.0413622],
                [0.61416006, 1.7016163, 1.3054738],
                [2.9139307, 10.725025, 3.2832346],
                [1.9751819, 4.641267, 1.74352],
                [2.772134, 6.565072, 3.2474296],
                [0.7800532, 2.138673, 1.4437162],
                [0.6667362, 0.7181772, 1.7616143],
                [0.40246472, 0.4027083, 1.0084083],
                [3.0059454, 12.8197, 4.1213827],
                [2.4986045, 6.9310856, 2.8382742],
            ],
        ),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=(
                (-1, 64),
                (64, 128),
                (128, 256),
                (256, 512),
                (512, 100000000.0),
            ),
        ),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2),
    ),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            # Cost weights equal the loss weights above (2.0 / 0.75 / 0.0).
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)
# ---------------------------------------------------------------------------
# Optimizer
# ---------------------------------------------------------------------------
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        # img_backbone parameters get a learning-rate multiplier of 0.5.
        custom_keys=dict(img_backbone=dict(lr_mult=0.5)),
    ),
    weight_decay=0.01,
)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# ---------------------------------------------------------------------------
# Learning-rate policy and runner
# ---------------------------------------------------------------------------
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[44],
)
total_epochs = 48
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
# mAP: 0.3512
# mATE: 0.7534
# mASE: 0.2863
# mAOE: 0.4665
# mAVE: 0.8070
# mAAE: 0.1861
# NDS: 0.4257
# Base config list this file builds on (the shared default runtime file).
_base_ = ['../_base_/default_runtime.py']

# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
# When this range is edited, the model settings that take a point cloud
# range must be edited to match.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# nuScenes detection is normally run with these 10 classes.
class_names = [
    'barrier',
    'bicycle',
    'bus',
    'car',
    'construction_vehicle',
    'motorcycle',
    'pedestrian',
    'traffic_cone',
    'trailer',
    'truck',
]

dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'

# Input modality for nuScenes; kept consistent with the submission format,
# which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)

img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675],
    std=[1, 1, 1],
    to_rgb=False,
)

bev_h_ = 200
bev_w_ = 200
frames = (0,)

# 102.4 equals the x/y extent of point_cloud_range (51.2 - (-51.2)).
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]

# Image augmentation settings handed to CropResizeFlipImage. In this config
# the training and evaluation settings hold identical values (single size,
# no random flip).
# NOTE(review): 'reisze' looks like a misspelling of 'resize'. The key is
# kept verbatim because the consuming transform presumably looks it up by
# this exact name -- confirm before renaming.
ida_aug_conf = dict(
    reisze=[640],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=False,
)

ida_aug_conf_eval = dict(
    reisze=[640],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=False,
)
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline, listed in execution order: load images and 3D
# annotations, filter objects, crop/resize, normalise/pad, format, collect
# the listed keys, then run DD3DMapper.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d',
            'gt_labels_3d',
            'img',
            'ego2global_translation',
            'ego2global_rotation',
            'lidar2ego_translation',
            'lidar2ego_rotation',
            'timestamp',
            'mono_input_dict',
            'mono_ann_idx',
            'aug_param',
        ],
    ),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),
    ),
]
# Evaluation pipeline: deterministic crop/resize (ida_aug_conf_eval), then
# normalise, pad and wrap the formatting/collection steps in
# MultiScaleFlipAug3D with flipping disabled.
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img',
                    'ego2global_translation',
                    'ego2global_rotation',
                    'lidar2ego_translation',
                    'lidar2ego_rotation',
                    'timestamp',
                ],
            ),
        ],
    ),
]
# Dataloader and dataset settings.
# Every split reads the shared `dataset_type` / `data_root` constants
# declared earlier in this file instead of re-typing the literals, so the
# dataset class and root directory cannot drift apart between splits.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2,
        ),
    ),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    # NOTE(review): the test split points at the *val* annotation file --
    # presumably intentional for local evaluation; confirm.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)

evaluation = dict(interval=4, pipeline=eval_pipeline)
# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

_dim_ = 256
# Derived from _dim_ rather than repeated as literals (128 and 512).
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 4
_num_mono_levels_ = 5

# The class count and the encoder's feature-level count below are taken from
# `class_names` and `_num_levels_` so they cannot disagree with the
# declarations they depend on.
model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        # Three stages are exported, one per entry of img_neck.in_channels.
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe',
    ),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True,
    ),
    pts_bbox_head=dict(
        type='BEVFormerHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=len(class_names),
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=_num_levels_,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=len(class_names),
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        # Weight 0.0: presumably disables the IoU term -- confirm.
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
    ),
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=len(class_names),
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0,
        ),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25,
            focal_loss_gamma=2.0,
            loc_loss_type='giou',
        ),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            # Five entries each, the same count as `strides`.
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275],
        ),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            # Ten rows; presumably one per entry of class_names, in the same
            # order -- confirm against the head implementation.
            canon_box_sizes=[
                [2.3524184, 0.5062202, 1.0413622],
                [0.61416006, 1.7016163, 1.3054738],
                [2.9139307, 10.725025, 3.2832346],
                [1.9751819, 4.641267, 1.74352],
                [2.772134, 6.565072, 3.2474296],
                [0.7800532, 2.138673, 1.4437162],
                [0.6667362, 0.7181772, 1.7616143],
                [0.40246472, 0.4027083, 1.0084083],
                [3.0059454, 12.8197, 4.1213827],
                [2.4986045, 6.9310856, 2.8382742],
            ],
        ),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=(
                (-1, 64),
                (64, 128),
                (128, 256),
                (256, 512),
                (512, 100000000.0),
            ),
        ),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2),
    ),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            # Cost weights equal the loss weights above (2.0 / 0.75 / 0.0).
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)
# ---------------------------------------------------------------------------
# Optimizer
# ---------------------------------------------------------------------------
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        # img_backbone parameters get a learning-rate multiplier of 0.5.
        custom_keys=dict(img_backbone=dict(lr_mult=0.5)),
    ),
    weight_decay=0.01,
)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# ---------------------------------------------------------------------------
# Learning-rate policy and runner
# ---------------------------------------------------------------------------
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[20],
)
total_epochs = 24
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
# mAP: 0.3594
# mATE: 0.7327
# mASE: 0.2814
# mAOE: 0.4074
# mAVE: 0.7831
# mAAE: 0.1983
# NDS: 0.4394
# Base config list this file builds on (the shared default runtime file).
_base_ = ['../_base_/default_runtime.py']

# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
# When this range is edited, the model settings that take a point cloud
# range must be edited to match.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# nuScenes detection is normally run with these 10 classes.
class_names = [
    'barrier',
    'bicycle',
    'bus',
    'car',
    'construction_vehicle',
    'motorcycle',
    'pedestrian',
    'traffic_cone',
    'trailer',
    'truck',
]

dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'

# Input modality for nuScenes; kept consistent with the submission format,
# which requires the information in input_modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)

img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675],
    std=[1, 1, 1],
    to_rgb=False,
)

bev_h_ = 200
bev_w_ = 200
frames = (0,)

# 102.4 equals the x/y extent of point_cloud_range (51.2 - (-51.2)).
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]

# Image augmentation settings handed to CropResizeFlipImage. In this config
# the training and evaluation settings hold identical values (single size,
# no random flip).
# NOTE(review): 'reisze' looks like a misspelling of 'resize'. The key is
# kept verbatim because the consuming transform presumably looks it up by
# this exact name -- confirm before renaming.
ida_aug_conf = dict(
    reisze=[640],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=False,
)

ida_aug_conf_eval = dict(
    reisze=[640],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=False,
)
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline, listed in execution order: load images and 3D
# annotations, filter objects, crop/resize, normalise/pad, format, collect
# the listed keys, then run DD3DMapper.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d',
            'gt_labels_3d',
            'img',
            'ego2global_translation',
            'ego2global_rotation',
            'lidar2ego_translation',
            'lidar2ego_rotation',
            'timestamp',
            'mono_input_dict',
            'mono_ann_idx',
            'aug_param',
        ],
    ),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),
    ),
]
# Evaluation pipeline: deterministic crop/resize (ida_aug_conf_eval), then
# normalise, pad and wrap the formatting/collection steps in
# MultiScaleFlipAug3D with flipping disabled.
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img',
                    'ego2global_translation',
                    'ego2global_rotation',
                    'lidar2ego_translation',
                    'lidar2ego_rotation',
                    'timestamp',
                ],
            ),
        ],
    ),
]
# Dataloader and dataset settings.
# Every split reads the shared `dataset_type` / `data_root` constants
# declared earlier in this file instead of re-typing the literals, so the
# dataset class and root directory cannot drift apart between splits.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2,
        ),
    ),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    # NOTE(review): the test split points at the *val* annotation file --
    # presumably intentional for local evaluation; confirm.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)

evaluation = dict(interval=4, pipeline=eval_pipeline)
# ---- Model ----
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# Shared sizes referenced by the model definition below.
_dim_ = 256            # embed_dims / channel width
_pos_dim_ = 128        # num_feats of the positional encoding
_ffn_dim_ = 512        # feedforward_channels of the transformer layers
_num_levels_ = 4       # model num_levels
_num_mono_levels_ = 5  # FPN num_outs and model num_mono_levels

model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    # Image backbone: caffe-style ResNet-50 with SyncBN, outputs (1, 2, 3).
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe',
    ),
    # FPN neck with `_num_mono_levels_` outputs of `_dim_` channels each.
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True,
    ),
    # BEV detection head with its encoder/decoder transformer.
    pts_bbox_head=dict(
        type='BEVFormerHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=4,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10,
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
    ),
    # Second head configured on the image features (type NuscenesDD3D).
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0,
        ),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25,
            focal_loss_gamma=2.0,
            loc_loss_type='giou',
        ),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275],
        ),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            # Ten rows; presumably one per entry of class_names -- confirm.
            canon_box_sizes=[
                [2.3524184, 0.5062202, 1.0413622],
                [0.61416006, 1.7016163, 1.3054738],
                [2.9139307, 10.725025, 3.2832346],
                [1.9751819, 4.641267, 1.74352],
                [2.772134, 6.565072, 3.2474296],
                [0.7800532, 2.138673, 1.4437162],
                [0.6667362, 0.7181772, 1.7616143],
                [0.40246472, 0.4027083, 1.0084083],
                [3.0059454, 12.8197, 4.1213827],
                [2.4986045, 6.9310856, 2.8382742],
            ],
        ),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=(
                (-1, 64), (64, 128), (128, 256), (256, 512),
                (512, 100000000.0),
            ),
        ),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2),
    ),
    # Assigner weights mirror the head's loss weights (2.0 / 0.75 / 0.0).
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)
# ---- Optimizer ----
# AdamW; parameters matched by the `img_backbone` key use lr_mult=0.5.
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys={'img_backbone': dict(lr_mult=0.5)},
    ),
    weight_decay=0.01,
)
# Gradient clipping: L2 norm capped at 35.
optimizer_config = dict(
    grad_clip=dict(max_norm=35, norm_type=2),
)
# ---- Learning-rate schedule ----
# Step policy with a single step at 44 and a 2000-iteration linear warmup
# starting from one third of the base learning rate.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[44],
)
total_epochs = 48
runner = dict(
    type='EpochBasedRunner',
    max_epochs=total_epochs,
)
# Reported results for this configuration:
#   mAP:  0.4199
#   mATE: 0.6689
#   mASE: 0.2814
#   mAOE: 0.3915
#   mAVE: 0.3834
#   mAAE: 0.1928
#   NDS:  0.5182
_base_ = ['../_base_/default_runtime.py']

# ---- Dataset ----
# If the point cloud range changes, the model's range must change with it
# (the model section of this config reads this same variable).
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# The ten classes used for nuScenes detection.
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle',
    'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck',
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'

# Input modality (camera only). Per the original note, this is consistent
# with the nuScenes submission format, which requires this information.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)

# Per-channel mean subtraction with unit std; no conversion to RGB.
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675],
    std=[1, 1, 1],
    to_rgb=False,
)

# BEV grid resolution.
bev_h_ = 200
bev_w_ = 200
# Frame offsets passed to the dataset and the model; presumably relative to
# the current frame (0) -- confirm against CustomNuScenesDatasetV2.
frames = (-1, 0)
# Group count passed to the GroupDETR head and its grouped self-attention.
group_detr = 11
# 102.4 equals the x/y extent of point_cloud_range and 8 its z extent.
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
# Image-space augmentation settings passed to CropResizeFlipImage.
# NOTE(review): "reisze" looks like a misspelling of "resize", but it is a
# runtime key -- confirm what CropResizeFlipImage reads before renaming it.
ida_aug_conf = dict(
    # Candidate sizes: 640 scaled by factors in (0.8, 1.2).
    reisze=[512, 544, 576, 608, 640, 672, 704, 736, 768],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=True,
)
# Evaluation variant: a single size and no random flip.
ida_aug_conf_eval = dict(
    reisze=[640],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=False,
)
# file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or another file client.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline: load the multi-view images and 3D annotations, apply the
# global rot/scale/flip transform, filter boxes by range and class name, run
# the image-space crop/resize/flip, then normalise, pad, format and collect
# the listed keys before handing the sample to DD3DMapper.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d', 'gt_labels_3d', 'img',
            'ego2global_translation', 'ego2global_rotation',
            'lidar2ego_translation', 'lidar2ego_rotation',
            'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param',
        ],
    ),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),
    ),
]
# Evaluation pipeline: load the multi-view images, run CropResizeFlipImage in
# non-training mode with the evaluation augmentation settings, normalise and
# pad, then format and collect the listed keys inside MultiScaleFlipAug3D
# (flip disabled).
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img',
                    'ego2global_translation',
                    'ego2global_rotation',
                    'lidar2ego_translation',
                    'lidar2ego_rotation',
                    'timestamp',
                ],
            ),
        ],
    ),
]
# Dataloader configuration. All three splits take the dataset class from
# `dataset_type` and the root directory from `data_root`, so the directory
# always agrees with the `ann_file` paths, which are built from `data_root`.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2,
        ),
    ),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    # The test split reads the same validation annotation file as `val`.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)
# Evaluation settings: interval of 4, using the evaluation pipeline.
evaluation = dict(interval=4, pipeline=eval_pipeline)
# ---- Model ----
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# Shared sizes referenced by the model definition below.
_dim_ = 256            # embed_dims / channel width
_pos_dim_ = 128        # num_feats of the positional encoding
_ffn_dim_ = 512        # feedforward_channels of the transformer layers
_num_levels_ = 4       # model num_levels
_num_mono_levels_ = 5  # FPN num_outs and model num_mono_levels

model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    # Image backbone: caffe-style ResNet-50 with SyncBN, outputs (1, 2, 3).
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe',
    ),
    # FPN neck with `_num_mono_levels_` outputs of `_dim_` channels each.
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True,
    ),
    # BEV detection head (GroupDETR variant) with its transformer.
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=4,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10,
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
    ),
    # Second head configured on the image features (type NuscenesDD3D).
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0,
        ),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25,
            focal_loss_gamma=2.0,
            loc_loss_type='giou',
        ),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275],
        ),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            # Ten rows; presumably one per entry of class_names -- confirm.
            canon_box_sizes=[
                [2.3524184, 0.5062202, 1.0413622],
                [0.61416006, 1.7016163, 1.3054738],
                [2.9139307, 10.725025, 3.2832346],
                [1.9751819, 4.641267, 1.74352],
                [2.772134, 6.565072, 3.2474296],
                [0.7800532, 2.138673, 1.4437162],
                [0.6667362, 0.7181772, 1.7616143],
                [0.40246472, 0.4027083, 1.0084083],
                [3.0059454, 12.8197, 4.1213827],
                [2.4986045, 6.9310856, 2.8382742],
            ],
        ),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=(
                (-1, 64), (64, 128), (128, 256), (256, 512),
                (512, 100000000.0),
            ),
        ),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2),
    ),
    # Assigner weights mirror the head's loss weights (2.0 / 0.75 / 0.0).
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)
# ---- Optimizer ----
# AdamW; parameters matched by the `img_backbone` key use lr_mult=0.5.
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys={'img_backbone': dict(lr_mult=0.5)},
    ),
    weight_decay=0.01,
)
# Gradient clipping: L2 norm capped at 35.
optimizer_config = dict(
    grad_clip=dict(max_norm=35, norm_type=2),
)
# ---- Learning-rate schedule ----
# Step policy with a single step at 20 and a 2000-iteration linear warmup
# starting from one third of the base learning rate.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[20],
)
total_epochs = 24
runner = dict(
    type='EpochBasedRunner',
    max_epochs=total_epochs,
)
# Reported results for this configuration:
#   mAP:  0.4313
#   mATE: 0.6557
#   mASE: 0.2775
#   mAOE: 0.3851
#   mAVE: 0.3861
#   mAAE: 0.1882
#   NDS:  0.5264
_base_ = ['../_base_/default_runtime.py']

# ---- Dataset ----
# If the point cloud range changes, the model's range must change with it
# (the model section of this config reads this same variable).
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# The ten classes used for nuScenes detection.
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle',
    'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck',
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'

# Input modality (camera only). Per the original note, this is consistent
# with the nuScenes submission format, which requires this information.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)

# Per-channel mean subtraction with unit std; no conversion to RGB.
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675],
    std=[1, 1, 1],
    to_rgb=False,
)

# BEV grid resolution.
bev_h_ = 200
bev_w_ = 200
# Frame offsets passed to the dataset and the model; presumably relative to
# the current frame (0) -- confirm against CustomNuScenesDatasetV2.
frames = (-1, 0)
# Group count passed to the GroupDETR head and its grouped self-attention.
group_detr = 11
# 102.4 equals the x/y extent of point_cloud_range and 8 its z extent.
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
# Image-space augmentation settings passed to CropResizeFlipImage.
# NOTE(review): "reisze" looks like a misspelling of "resize", but it is a
# runtime key -- confirm what CropResizeFlipImage reads before renaming it.
ida_aug_conf = dict(
    # Candidate sizes: 640 scaled by factors in (0.8, 1.2).
    reisze=[512, 544, 576, 608, 640, 672, 704, 736, 768],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=True,
)
# Evaluation variant: a single size and no random flip.
ida_aug_conf_eval = dict(
    reisze=[640],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=False,
)
# file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or another file client.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline: load the multi-view images and 3D annotations, apply the
# global rot/scale/flip transform, filter boxes by range and class name, run
# the image-space crop/resize/flip, then normalise, pad, format and collect
# the listed keys before handing the sample to DD3DMapper.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d', 'gt_labels_3d', 'img',
            'ego2global_translation', 'ego2global_rotation',
            'lidar2ego_translation', 'lidar2ego_rotation',
            'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param',
        ],
    ),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),
    ),
]
# Evaluation pipeline: load the multi-view images, run CropResizeFlipImage in
# non-training mode with the evaluation augmentation settings, normalise and
# pad, then format and collect the listed keys inside MultiScaleFlipAug3D
# (flip disabled).
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img',
                    'ego2global_translation',
                    'ego2global_rotation',
                    'lidar2ego_translation',
                    'lidar2ego_rotation',
                    'timestamp',
                ],
            ),
        ],
    ),
]
# Dataloader configuration. All three splits take the dataset class from
# `dataset_type` and the root directory from `data_root`, so the directory
# always agrees with the `ann_file` paths, which are built from `data_root`.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2,
        ),
    ),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    # The test split reads the same validation annotation file as `val`.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)
# Evaluation settings: interval of 4, using the evaluation pipeline.
evaluation = dict(interval=4, pipeline=eval_pipeline)
# ---- Model ----
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# Shared sizes referenced by the model definition below.
_dim_ = 256            # embed_dims / channel width
_pos_dim_ = 128        # num_feats of the positional encoding
_ffn_dim_ = 512        # feedforward_channels of the transformer layers
_num_levels_ = 4       # model num_levels
_num_mono_levels_ = 5  # FPN num_outs and model num_mono_levels

model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    # Image backbone: caffe-style ResNet-50 with SyncBN, outputs (1, 2, 3).
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe',
    ),
    # FPN neck with `_num_mono_levels_` outputs of `_dim_` channels each.
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True,
    ),
    # BEV detection head (GroupDETR variant) with its transformer.
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=4,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10,
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
    ),
    # Second head configured on the image features (type NuscenesDD3D).
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0,
        ),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25,
            focal_loss_gamma=2.0,
            loc_loss_type='giou',
        ),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275],
        ),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            # Ten rows; presumably one per entry of class_names -- confirm.
            canon_box_sizes=[
                [2.3524184, 0.5062202, 1.0413622],
                [0.61416006, 1.7016163, 1.3054738],
                [2.9139307, 10.725025, 3.2832346],
                [1.9751819, 4.641267, 1.74352],
                [2.772134, 6.565072, 3.2474296],
                [0.7800532, 2.138673, 1.4437162],
                [0.6667362, 0.7181772, 1.7616143],
                [0.40246472, 0.4027083, 1.0084083],
                [3.0059454, 12.8197, 4.1213827],
                [2.4986045, 6.9310856, 2.8382742],
            ],
        ),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=(
                (-1, 64), (64, 128), (128, 256), (256, 512),
                (512, 100000000.0),
            ),
        ),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2),
    ),
    # Assigner weights mirror the head's loss weights (2.0 / 0.75 / 0.0).
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)
# ---- Optimizer ----
# AdamW; parameters matched by the `img_backbone` key use lr_mult=0.5.
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(
        custom_keys={'img_backbone': dict(lr_mult=0.5)},
    ),
    weight_decay=0.01,
)
# Gradient clipping: L2 norm capped at 35.
optimizer_config = dict(
    grad_clip=dict(max_norm=35, norm_type=2),
)
# ---- Learning-rate schedule ----
# Step policy with a single step at 20 and a 2000-iteration linear warmup
# starting from one third of the base learning rate.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[20],
)
total_epochs = 24
runner = dict(
    type='EpochBasedRunner',
    max_epochs=total_epochs,
)
# Reported results for this configuration:
#   mAP:  0.4600
#   mATE: 0.6185
#   mASE: 0.2815
#   mAOE: 0.3660
#   mAVE: 0.3157
#   mAAE: 0.1902
#   NDS:  0.5528
_base_ = ['../_base_/default_runtime.py']

# ---- Dataset ----
# If the point cloud range changes, the model's range must change with it
# (the model section of this config reads this same variable).
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# The ten classes used for nuScenes detection.
class_names = [
    'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle',
    'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck',
]
dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'

# Input modality (camera only). Per the original note, this is consistent
# with the nuScenes submission format, which requires this information.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)

# Per-channel mean subtraction with unit std; no conversion to RGB.
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675],
    std=[1, 1, 1],
    to_rgb=False,
)

# BEV grid resolution.
bev_h_ = 200
bev_w_ = 200
# Eight frame offsets passed to the dataset and the model; presumably
# relative to the current frame (0) -- confirm against the dataset class.
frames = (-7, -6, -5, -4, -3, -2, -1, 0)
# Group count passed to the GroupDETR head and its grouped self-attention.
group_detr = 11
# 102.4 equals the x/y extent of point_cloud_range and 8 its z extent.
voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
# Image-space augmentation settings passed to CropResizeFlipImage.
# NOTE(review): "reisze" looks like a misspelling of "resize", but it is a
# runtime key -- confirm what CropResizeFlipImage reads before renaming it.
ida_aug_conf = dict(
    # Candidate sizes: 640 scaled by factors in (0.8, 1.2).
    reisze=[512, 544, 576, 608, 640, 672, 704, 736, 768],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=True,
)
# Evaluation variant: a single size and no random flip.
ida_aug_conf_eval = dict(
    reisze=[640],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=False,
)
# file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or another file client.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
# Training pipeline: load the multi-view images and 3D annotations, apply the
# global rot/scale/flip transform, filter boxes by range and class name, run
# the image-space crop/resize/flip, then normalise, pad, format and collect
# the listed keys before handing the sample to DD3DMapper.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d', 'gt_labels_3d', 'img',
            'ego2global_translation', 'ego2global_rotation',
            'lidar2ego_translation', 'lidar2ego_rotation',
            'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param',
        ],
    ),
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),
    ),
]
# Evaluation pipeline: load the multi-view images, run CropResizeFlipImage in
# non-training mode with the evaluation augmentation settings, normalise and
# pad, then format and collect the listed keys inside MultiScaleFlipAug3D
# (flip disabled).
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img',
                    'ego2global_translation',
                    'ego2global_rotation',
                    'lidar2ego_translation',
                    'lidar2ego_rotation',
                    'timestamp',
                ],
            ),
        ],
    ),
]
# Dataloader configuration. All three splits take the dataset class from
# `dataset_type` and the root directory from `data_root`, so the directory
# always agrees with the `ann_file` paths, which are built from `data_root`.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2,
        ),
    ),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1,
    ),
    # The test split reads the same validation annotation file as `val`.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
    ),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)
# Evaluation settings: interval of 4, using the evaluation pipeline.
evaluation = dict(interval=4, pipeline=eval_pipeline)
# ---- Model ----
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# Shared sizes referenced by the model definition below.
_dim_ = 256            # embed_dims / channel width
_pos_dim_ = 128        # num_feats of the positional encoding
_ffn_dim_ = 512        # feedforward_channels of the transformer layers
_num_levels_ = 4       # model num_levels
_num_mono_levels_ = 5  # FPN num_outs and model num_mono_levels

model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    # Image backbone: caffe-style ResNet-50 with SyncBN, outputs (1, 2, 3).
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe',
    ),
    # FPN neck with `_num_mono_levels_` outputs of `_dim_` channels each.
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True,
    ),
    # BEV detection head (GroupDETR variant) with its transformer.
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            # Twice the embedding width.
            inter_channels=_dim_ * 2,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=4,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10,
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
    ),
    # Second head configured on the image features (type NuscenesDD3D).
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=10,
        in_channels=_dim_,
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0,
        ),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25,
            focal_loss_gamma=2.0,
            loc_loss_type='giou',
        ),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275],
        ),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            # Ten rows; presumably one per entry of class_names -- confirm.
            canon_box_sizes=[
                [2.3524184, 0.5062202, 1.0413622],
                [0.61416006, 1.7016163, 1.3054738],
                [2.9139307, 10.725025, 3.2832346],
                [1.9751819, 4.641267, 1.74352],
                [2.772134, 6.565072, 3.2474296],
                [0.7800532, 2.138673, 1.4437162],
                [0.6667362, 0.7181772, 1.7616143],
                [0.40246472, 0.4027083, 1.0084083],
                [3.0059454, 12.8197, 4.1213827],
                [2.4986045, 6.9310856, 2.8382742],
            ],
        ),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            sizes_of_interest=(
                (-1, 64), (64, 128), (128, 256), (256, 512),
                (512, 100000000.0),
            ),
        ),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2),
    ),
    # Assigner weights mirror the head's loss weights (2.0 / 0.75 / 0.0).
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)
# Optimizer: AdamW; parameters matched by the 'img_backbone' key get an
# lr_mult of 0.5.
optimizer = {
    'type': 'AdamW',
    'lr': 4e-4,
    'paramwise_cfg': {
        'custom_keys': {
            'img_backbone': {'lr_mult': 0.5},
        },
    },
    'weight_decay': 0.01,
}
# Gradient clipping settings (max_norm=35 with norm_type=2).
optimizer_config = {'grad_clip': {'max_norm': 35, 'norm_type': 2}}

# Learning policy: step schedule with a single milestone at 20 and a linear
# warmup over the first 2000 iterations.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 2000,
    'warmup_ratio': 1.0 / 3,
    'step': [20],
}

# The runner's epoch budget is taken from total_epochs.
total_epochs = 24
runner = {'type': 'EpochBasedRunner', 'max_epochs': total_epochs}
# If point cloud range is changed, the models should also change their point
# cloud range accordingly.
point_cloud_range = [-80, -80, -5, 80, 80, 3]

# For Lyft we usually do 9-class detection.
class_names = [
    'car',
    'truck',
    'bus',
    'emergency_vehicle',
    'other_vehicle',
    'motorcycle',
    'bicycle',
    'pedestrian',
    'animal',
]
dataset_type = 'CustomLyftDataset'
data_root = 'data/lyft/'

# Input modality for Lyft dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=True,
    use_camera=False,
    use_radar=False,
    use_map=False,
    use_external=True,
)

file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/lyft/': 's3://lyft/lyft/',
#         'data/lyft/': 's3://lyft/lyft/'
#     }))

# Training pipeline: load points and sweeps, load 3D annotations, augment,
# filter by range, shuffle, then format and collect.
train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
    dict(
        type='GlobalRotScaleTrans',
        rot_range=[-0.3925, 0.3925],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
    ),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]

# Test pipeline: same loading steps, with the transforms wrapped in
# MultiScaleFlipAug3D (identity rotation/scale, flip disabled).
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='GlobalRotScaleTrans',
                rot_range=[0, 0],
                scale_ratio_range=[1.0, 1.0],
                translation_std=[0, 0, 0],
            ),
            dict(type='RandomFlip3D'),
            dict(
                type='PointsRangeFilter',
                point_cloud_range=point_cloud_range,
            ),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='Collect3D', keys=['points']),
        ],
    ),
]

# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args,
    ),
    dict(
        type='DefaultFormatBundle3D',
        class_names=class_names,
        with_label=False,
    ),
    dict(type='Collect3D', keys=['points']),
]

# Dataset splits. val and test both read the validation info file.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=f'{data_root}lyft_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=f'{data_root}lyft_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=f'{data_root}lyft_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
    ),
)

# For Lyft dataset, we usually evaluate the model at the end of training.
# Since the models are trained by 24 epochs by default, we set evaluation
# interval to be 24. Please change the interval accordingly if you do not
# use a default schedule.
evaluation = dict(interval=24, pipeline=eval_pipeline)
\ No newline at end of file
# If point cloud range is changed, the models should also change their point
# cloud range accordingly.
point_cloud_range = [-50, -50, -5, 50, 50, 3]

# For nuScenes we usually do 10-class detection.
class_names = [
    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
dataset_type = 'NuScenesDataset_eval_modified'
data_root = 'data/nuscenes/'

# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
    use_lidar=True,
    use_camera=False,
    use_radar=False,
    use_map=False,
    use_external=False)

file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
#     }))

# Training pipeline: load points and sweeps, load 3D annotations, augment,
# filter by range and class name, shuffle, then format and collect.
train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
    dict(
        type='GlobalRotScaleTrans',
        rot_range=[-0.3925, 0.3925],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0]),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]

# Test pipeline: same loading steps, with the transforms wrapped in
# MultiScaleFlipAug3D (identity rotation/scale, flip disabled).
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='GlobalRotScaleTrans',
                rot_range=[0, 0],
                scale_ratio_range=[1., 1.],
                translation_std=[0, 0, 0]),
            dict(type='RandomFlip3D'),
            dict(
                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points'])
        ])
]

# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(
        type='DefaultFormatBundle3D',
        class_names=class_names,
        with_label=False),
    dict(type='Collect3D', keys=['points'])
]

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=dict(
        type=dataset_type,
        # data_root was missing here although train and test both set it;
        # pass it so all three splits are configured the same way.
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR'),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR'))

# For nuScenes dataset, we usually evaluate the model at the end of training.
# Since the models are trained by 24 epochs by default, we set evaluation
# interval to be 24. Please change the interval accordingly if you do not
# use a default schedule.
evaluation = dict(interval=24, pipeline=eval_pipeline)
# Dataset settings.
# D5 in the config name means the whole dataset is divided into 5 folds.
# We only use one fold for efficient experiments.
dataset_type = 'CustomWaymoDataset'
data_root = 'data/waymo/kitti_format/'

file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel', path_mapping=dict(data='s3://waymo_data/'))

# Image normalisation passed to NormalizeMultiviewImage in both pipelines.
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675],
    std=[1.0, 1.0, 1.0],
    to_rgb=False,
)
class_names = ['Car', 'Pedestrian', 'Cyclist']
point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
input_modality = dict(use_lidar=False, use_camera=True)

# Ground-truth database sampler settings. Neither pipeline in this file
# references db_sampler.
db_sampler = dict(
    data_root=data_root,
    info_path=f'{data_root}waymo_dbinfos_train.pkl',
    rate=1.0,
    prepare=dict(
        filter_by_difficulty=[-1],
        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10),
    ),
    classes=class_names,
    sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
    points_loader=dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=[0, 1, 2, 3, 4],
        file_client_args=file_client_args,
    ),
)

# Camera-only training pipeline.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']),
]

# Camera-only test pipeline.
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1920, 1280),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(type='CustomCollect3D', keys=['img']),
        ],
    ),
]

# No separate eval pipeline is defined in this file; the evaluation setting
# at the bottom reuses test_pipeline.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=4,
    train=dict(
        type='RepeatDataset',
        times=2,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            ann_file=f'{data_root}waymo_infos_train.pkl',
            split='training',
            pipeline=train_pipeline,
            modality=input_modality,
            classes=class_names,
            test_mode=False,
            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
            box_type_3d='LiDAR',
            # load one frame every five frames
            load_interval=5,
        ),
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=f'{data_root}waymo_infos_val.pkl',
        split='training',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True,
        box_type_3d='LiDAR',
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=f'{data_root}waymo_infos_val.pkl',
        split='training',
        pipeline=test_pipeline,
        modality=input_modality,
        classes=class_names,
        test_mode=True,
        box_type_3d='LiDAR',
    ),
)
evaluation = dict(interval=24, pipeline=test_pipeline)
\ No newline at end of file
from .core.bbox.assigners.hungarian_assigner_3d import HungarianAssigner3D
from .core.bbox.coders.nms_free_coder import NMSFreeCoder
from .core.bbox.match_costs import BBox3DL1Cost
from .core.evaluation.eval_hooks import CustomDistEvalHook
from .datasets.pipelines import (
PhotoMetricDistortionMultiViewImage, PadMultiViewImage,
NormalizeMultiviewImage, CustomCollect3D)
from .models.utils import *
from .models.opt.adamw import AdamW2
from .bevformer import *
from .dd3d import *
from .dense_heads import *
from .detectors import *
from .modules import *
from .runner import *
from .hooks import *
from .train import custom_train_model
from .mmdet_train import custom_train_detector
# from .test import custom_multi_gpu_test
\ No newline at end of file
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import random
import warnings
import numpy as np
import torch
import torch.distributed as dist
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
Fp16OptimizerHook, OptimizerHook, build_optimizer,
build_runner, get_dist_info)
from mmcv.utils import build_from_cfg
from mmdet.core import EvalHook
from mmdet.datasets import (build_dataset,
replace_ImageToTensor)
from mmdet.utils import get_root_logger
import time
import os.path as osp
from projects.mmdet3d_plugin.datasets.builder import build_dataloader
from projects.mmdet3d_plugin.core.evaluation.eval_hooks import CustomDistEvalHook
from projects.mmdet3d_plugin.datasets import custom_build_dataset
from mmcv.runner import Hook
class ProfilerHook(Hook):
    """Step a profiler after every training iteration and dump a summary.

    After each training iteration the hook calls ``profiler.step()``. Once
    ``profiler.step_num`` equals ``total_steps`` it stops the profiler
    instead and, on rank 0, writes the ``key_averages()`` table (sorted by
    ``cuda_time_total``) to ``<log_dir>/BW_log_step<total_steps>.txt``.
    After that the hook does nothing.

    Args:
        profiler: Profiler object exposing ``step_num``, ``step()``,
            ``stop()`` and ``key_averages()``.
        total_steps (int): Step count at which profiling is stopped, i.e.
            (wait + warmup + active) * repeat of the profiler schedule.
        log_dir (str | None): Directory receiving the summary table.
            Defaults to ``DEFAULT_LOG_DIR`` so existing callers keep writing
            to the same location.
    """

    # Location the summary was historically hard-coded to.
    DEFAULT_LOG_DIR = "/workspace/BEVFormer/profiler_logs"

    def __init__(self, profiler, total_steps, log_dir=None):
        self.profiler = profiler
        # Total number of steps: (wait + warmup + active) * repeat.
        self.total_steps = total_steps
        self.log_dir = self.DEFAULT_LOG_DIR if log_dir is None else log_dir
        self.stopped = False

    def after_train_iter(self, runner):
        """Advance the profiler, or stop it and write the summary table."""
        if self.stopped:
            return

        if self.profiler.step_num != self.total_steps:
            self.profiler.step()
            return

        # The scheduled number of steps has elapsed: stop the profiler.
        self.profiler.stop()
        self.stopped = True

        # Only rank 0 writes the results.
        rank, _ = get_dist_info()
        if rank == 0:
            import os

            results = self.profiler.key_averages().table(
                sort_by="cuda_time_total")
            # Create the directory first so open() cannot fail on a missing
            # parent directory.
            os.makedirs(self.log_dir, exist_ok=True)
            log_file = osp.join(
                self.log_dir, "BW_log_step{}.txt".format(self.total_steps))
            with open(log_file, mode='w') as file:
                file.write(str(results))
def custom_train_detector(model,
                          dataset,
                          cfg,
                          distributed=False,
                          validate=False,
                          timestamp=None,
                          eval_model=None,
                          meta=None):
    """Build data loaders, runner and hooks from ``cfg`` and run training.

    Args:
        model: Model to train. Wrapped in ``MMDistributedDataParallel`` when
            ``distributed`` is true, otherwise in ``MMDataParallel``.
        dataset: A dataset or a list/tuple of datasets; one data loader is
            built per entry.
        cfg: Config object, read through attribute access and ``cfg.get``.
        distributed (bool): Selects the distributed wrappers, the
            ``DistSamplerSeedHook`` and ``CustomDistEvalHook``.
        validate (bool): When true, a validation loader is built from
            ``cfg.data.val`` and an eval hook is registered.
        timestamp: Stored on ``runner.timestamp``.
        eval_model: Optional second model. It is wrapped the same way as
            ``model`` and passed to the runner as ``eval_model``.
        meta: Forwarded to the runner.

    Profiling is opt-in through the config: with ``enable_profiler = True``
    a ``torch.profiler`` session is started around ``runner.run`` and a
    ``ProfilerHook`` is registered. The default (key absent or false) runs
    plain training, which is what the previous hard-coded ``if False:``
    switch did.
    """
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiments')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiments')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed,
            shuffler_sampler=cfg.data.shuffler_sampler,
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,
        ) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.to(device='cuda', memory_format=torch.channels_last),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
        if eval_model is not None:
            eval_model = MMDistributedDataParallel(
                eval_model.to(device='cuda', memory_format=torch.channels_last),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
        if eval_model is not None:
            eval_model = MMDataParallel(
                eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    if 'runner' not in cfg:
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)
    else:
        if 'total_epochs' in cfg:
            assert cfg.total_epochs == cfg.runner.max_epochs

    if eval_model is not None:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                eval_model=eval_model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))
    else:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))

    if distributed:
        if isinstance(runner, EpochBasedRunner):
            runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            # Validation with more than one sample per GPU is rejected here;
            # the statement below is only reached if asserts are disabled.
            assert False
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            shuffler_sampler=cfg.data.shuffler_sampler,
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,
        )
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_'))
        eval_hook = CustomDistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    if not cfg.get('enable_profiler', False):
        # Normal training.
        runner.run(data_loaders, cfg.workflow)
        return

    # Profiler schedule: skip 1 step, warm up for 20 steps (not recorded),
    # record 1 step, and run that cycle once.
    wait_steps, warmup_steps, active_steps, repeat = 1, 20, 1, 1
    total_steps = (wait_steps + warmup_steps + active_steps) * repeat  # 22
    profiler = torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA
        ],
        schedule=torch.profiler.schedule(
            wait=wait_steps,
            warmup=warmup_steps,
            active=active_steps,
            repeat=repeat),
        on_trace_ready=torch.profiler.tensorboard_trace_handler(
            "/workspace/BEVFormer/profiler_logs"),
        with_stack=True,        # collect call-stack information
        profile_memory=False,   # memory profiling disabled
        record_shapes=False     # tensor-shape recording disabled
    )

    # The hook steps the profiler each iteration and stops it after
    # total_steps steps.
    profiler_hook = ProfilerHook(profiler, total_steps)
    runner.register_hook(profiler_hook)

    profiler.start()
    print("==================================== profiler.start()===================================================================")
    try:
        runner.run(data_loaders, cfg.workflow)
    finally:
        # Make sure the profiler is stopped, but do not stop it a second
        # time if the hook already did.
        if not profiler_hook.stopped:
            profiler.stop()
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import random
import warnings
import numpy as np
import torch
import torch.distributed as dist
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
Fp16OptimizerHook, OptimizerHook, build_optimizer,
build_runner, get_dist_info)
from mmcv.utils import build_from_cfg
from mmdet.core import EvalHook
from mmdet.datasets import (build_dataset,
replace_ImageToTensor)
from mmdet.utils import get_root_logger
import time
import os.path as osp
from projects.mmdet3d_plugin.datasets.builder import build_dataloader
from projects.mmdet3d_plugin.core.evaluation.eval_hooks import CustomDistEvalHook
from projects.mmdet3d_plugin.datasets import custom_build_dataset
def custom_train_detector(model,
                          dataset,
                          cfg,
                          distributed=False,
                          validate=False,
                          timestamp=None,
                          eval_model=None,
                          meta=None):
    """Build data loaders, runner and hooks from ``cfg`` and run training.

    Args:
        model: Model to train. Wrapped in ``MMDistributedDataParallel`` when
            ``distributed`` is true, otherwise in ``MMDataParallel``.
        dataset: A dataset or a list/tuple of datasets; one data loader is
            built per entry.
        cfg: Config object, read through attribute access and ``cfg.get``.
        distributed (bool): Selects the distributed wrappers, the
            ``DistSamplerSeedHook`` and ``CustomDistEvalHook``.
        validate (bool): When true, a validation loader is built from
            ``cfg.data.val`` and an eval hook is registered.
        timestamp: Stored on ``runner.timestamp``.
        eval_model: Optional second model. It is wrapped the same way as
            ``model`` and passed to the runner as ``eval_model``.
        meta: Forwarded to the runner.
    """
    logger = get_root_logger(cfg.log_level)

    # ---- data loaders -----------------------------------------------------
    if not isinstance(dataset, (list, tuple)):
        dataset = [dataset]

    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiments')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiments')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = []
    for ds in dataset:
        loader = build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed,
            shuffler_sampler=cfg.data.shuffler_sampler,
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,
        )
        data_loaders.append(loader)

    # ---- wrap the model(s) for the chosen parallel mode -------------------
    if distributed:
        # Forwarded to torch.nn.parallel.DistributedDataParallel.
        find_unused_parameters = cfg.get('find_unused_parameters', False)

        def _parallelize(module):
            return MMDistributedDataParallel(
                module.to(device='cuda', memory_format=torch.channels_last),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
    else:

        def _parallelize(module):
            return MMDataParallel(
                module.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    model = _parallelize(model)
    if eval_model is not None:
        eval_model = _parallelize(eval_model)

    # ---- runner -----------------------------------------------------------
    optimizer = build_optimizer(model, cfg.optimizer)

    if 'runner' in cfg:
        if 'total_epochs' in cfg:
            assert cfg.total_epochs == cfg.runner.max_epochs
    else:
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)

    runner_args = dict(
        model=model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    if eval_model is not None:
        # Only passed along when a separate evaluation model was supplied.
        runner_args['eval_model'] = eval_model
    runner = build_runner(cfg.runner, default_args=runner_args)

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # ---- optimizer hook (fp16 aware) --------------------------------------
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # ---- hooks ------------------------------------------------------------
    runner.register_training_hooks(
        cfg.lr_config,
        optimizer_config,
        cfg.checkpoint_config,
        cfg.log_config,
        cfg.get('momentum_config', None))

    if distributed and isinstance(runner, EpochBasedRunner):
        runner.register_hook(DistSamplerSeedHook())

    if validate:
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            # Validation with more than one sample per GPU is rejected here;
            # the statement below is only reached if asserts are disabled.
            assert False
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            shuffler_sampler=cfg.data.shuffler_sampler,
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,
        )
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        # Result files get a prefix built from the work dir and the current
        # time, with spaces and colons replaced by underscores.
        stamp = time.ctime().replace(' ', '_').replace(':', '_')
        eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, stamp)
        hook_cls = CustomDistEvalHook if distributed else EvalHook
        runner.register_hook(hook_cls(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for entry in cfg.custom_hooks:
            assert isinstance(entry, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(entry)}'
            # Work on a copy so popping 'priority' leaves the config intact.
            hook_kwargs = entry.copy()
            priority = hook_kwargs.pop('priority', 'NORMAL')
            runner.register_hook(
                build_from_cfg(hook_kwargs, HOOKS), priority=priority)

    # ---- checkpoint restore and launch ------------------------------------
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow)
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import os.path as osp
import pickle
import shutil
import tempfile
import time
import mmcv
import torch
import torch.distributed as dist
from mmcv.image import tensor2imgs
from mmcv.runner import get_dist_info
from mmdet.core import encode_mask_results
import mmcv
import numpy as np
import pycocotools.mask as mask_util
def custom_encode_mask_results(mask_results):
    """Encode bitmap masks to RLE code. Semantic masks only.

    Args:
        mask_results (list | tuple[list]): bitmap mask results, one 2-D
            mask per entry.

    Returns:
        list: A single-element list holding the list of RLE-encoded masks,
        in input order.
    """

    def _rle(bitmap):
        # Add a trailing axis and convert to a Fortran-ordered uint8 array
        # before encoding; take the first entry of the encoder's output.
        stacked = np.array(
            bitmap[:, :, np.newaxis], order='F', dtype='uint8')
        return mask_util.encode(stacked)[0]  # encoded with RLE

    return [[_rle(segm) for segm in mask_results]]
def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
    """Test model with multiple gpus.

    This method tests model with multiple gpus and collects the results
    under two different modes: gpu and cpu modes. By setting 'gpu_collect=True'
    it encodes results to gpu tensors and use gpu communication for results
    collection. On cpu mode it saves the results on different gpus to 'tmpdir'
    and collects them by the rank 0 worker.

    The inference loop runs under a ``torch.profiler`` session whose traces
    go to ``/workspace/BEVFormer/profiler_logs/``. On rank 0 the latency of
    the current iteration is printed every 20 iterations.

    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        tmpdir (str): Path of directory to save the temporary results from
            different gpus under cpu mode.
        gpu_collect (bool): Option to use either gpu or cpu to collect results.

    Returns:
        list | dict: The bbox prediction results, or a dict with keys
        ``'bbox_results'`` and ``'mask_results'`` when mask results were
        produced.
    """
    from torch.profiler import ProfilerActivity, record_function

    model = model.to(memory_format=torch.channels_last)
    model.eval()
    bbox_results = []
    mask_results = []
    dataset = data_loader.dataset
    rank, world_size = get_dist_info()
    if rank == 0:
        prog_bar = mmcv.ProgressBar(len(dataset))
    time.sleep(2)  # This line can prevent deadlock problem in some cases.
    have_mask = False

    prof = torch.profiler.profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        schedule=torch.profiler.schedule(wait=10, warmup=10, active=2, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler('/workspace/BEVFormer/profiler_logs/'),
        record_shapes=True,
        with_stack=True
    )
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    # The context manager starts the profiler and guarantees it is stopped
    # even if inference raises part-way through the loop.
    with prof:
        for i, data in enumerate(data_loader):
            # Reset per batch so a dict result without 'bbox_results' does
            # not reuse a stale count or hit an unbound local below.
            batch_size = 0

            # Record the GPU start time.
            start_event.record()
            with torch.no_grad():
                # Label the forward pass in the profiler trace.
                with record_function("model_inference"):
                    result = model(return_loss=False, rescale=True, **data)

                if isinstance(result, dict):
                    if 'bbox_results' in result.keys():
                        bbox_result = result['bbox_results']
                        batch_size = len(result['bbox_results'])
                        bbox_results.extend(bbox_result)
                    if 'mask_results' in result.keys() and result['mask_results'] is not None:
                        mask_result = custom_encode_mask_results(result['mask_results'])
                        mask_results.extend(mask_result)
                        have_mask = True
                else:
                    batch_size = len(result)
                    bbox_results.extend(result)
            # Record the GPU end time.
            end_event.record()

            # Advance the profiler schedule.
            prof.step()

            # Synchronise and print only every 20 iterations; the reported
            # time is that of the current iteration.
            if rank == 0 and i % 20 == 0 and i > 0:
                torch.cuda.synchronize()
                iter_time = start_event.elapsed_time(end_event) / 1000.0
                # Guard against a zero elapsed time.
                fps = 1 / iter_time if iter_time > 0 else float('inf')
                print(f"[Iter {i}] Latency: {iter_time:.4f}s | FPS: {fps:.2f}")

            if rank == 0:
                for _ in range(batch_size * world_size):
                    prog_bar.update()

    # collect results from all ranks
    if gpu_collect:
        bbox_results = collect_results_gpu(bbox_results, len(dataset))
        if have_mask:
            mask_results = collect_results_gpu(mask_results, len(dataset))
        else:
            mask_results = None
    else:
        bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir)
        tmpdir = tmpdir+'_mask' if tmpdir is not None else None
        if have_mask:
            mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir)
        else:
            mask_results = None

    if mask_results is None:
        return bbox_results
    return {'bbox_results': bbox_results, 'mask_results': mask_results}
def collect_results_cpu(result_part, size, tmpdir=None):
    """Gather per-rank results on rank 0 through a shared directory.

    Every rank dumps ``result_part`` to ``part_{rank}.pkl`` inside
    ``tmpdir``; after a barrier, rank 0 loads all parts, concatenates them
    in rank order, truncates to ``size`` and removes the directory.

    Args:
        result_part: Results produced by the current rank.
        size: Number of results to keep after concatenation.
        tmpdir: Directory used for the exchange. When ``None``, rank 0
            creates one under ``.dist_test`` and broadcasts its path.

    Returns:
        The concatenated results on rank 0, ``None`` on every other rank.
    """
    rank, world_size = get_dist_info()

    if tmpdir is not None:
        mmcv.mkdir_or_exist(tmpdir)
    else:
        # The path is shared as a fixed-size uint8 tensor padded with
        # spaces (ASCII 32), which are stripped again after decoding.
        path_buf_len = 512
        path_buf = torch.full(
            (path_buf_len, ), 32, dtype=torch.uint8, device='cuda')
        if rank == 0:
            mmcv.mkdir_or_exist('.dist_test')
            created_dir = tempfile.mkdtemp(dir='.dist_test')
            encoded_dir = torch.tensor(
                bytearray(created_dir.encode()),
                dtype=torch.uint8,
                device='cuda')
            path_buf[:len(encoded_dir)] = encoded_dir
        dist.broadcast(path_buf, 0)
        tmpdir = path_buf.cpu().numpy().tobytes().decode().rstrip()

    # Each rank writes its own part, then waits for all the others.
    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()

    if rank != 0:
        return None

    # The evaluation sampler was changed so that each GPU handles a
    # continuous range of samples, hence the parts are concatenated in
    # rank order instead of being interleaved with zip(*parts).
    loaded_parts = [
        mmcv.load(osp.join(tmpdir, f'part_{part_rank}.pkl'))
        for part_rank in range(world_size)
    ]
    gathered = []
    for part in loaded_parts:
        gathered.extend(list(part))

    # The dataloader may pad some samples; drop the surplus.
    gathered = gathered[:size]

    shutil.rmtree(tmpdir)
    return gathered
def collect_results_gpu(result_part, size):
    """Gather per-rank results by delegating to ``collect_results_cpu``.

    Args:
        result_part: Results produced by the current rank.
        size: Number of results to keep after concatenation.

    Returns:
        Whatever ``collect_results_cpu(result_part, size)`` returns.
    """
    # The result must be returned; without the ``return`` every caller
    # received ``None`` regardless of rank.
    return collect_results_cpu(result_part, size)
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from .mmdet_train import custom_train_detector
from mmseg.apis import train_segmentor
from mmdet.apis import train_detector
def custom_train_model(model,
                       dataset,
                       cfg,
                       distributed=False,
                       validate=False,
                       timestamp=None,
                       eval_model=None,
                       meta=None):
    """A function wrapper for launching model training according to cfg.

    Because we need different eval_hook in runner. Should be deprecated in the
    future.

    All arguments are forwarded unchanged to ``custom_train_detector``.

    Raises:
        AssertionError: If ``cfg.model.type`` is ``'EncoderDecoder3D'``,
            which this wrapper does not handle.
    """
    if cfg.model.type in ['EncoderDecoder3D']:
        # Raise explicitly instead of ``assert False``: assert statements
        # are stripped under ``python -O``, which would turn this branch
        # into a silent no-op.
        raise AssertionError(
            'custom_train_model does not support '
            f'cfg.model.type={cfg.model.type!r}')

    custom_train_detector(
        model,
        dataset,
        cfg,
        distributed=distributed,
        validate=validate,
        timestamp=timestamp,
        eval_model=eval_model,
        meta=meta)
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Dispatch training to the segmentor or detector entry point.

    ``train_segmentor`` is used when ``cfg.model.type`` is
    ``'EncoderDecoder3D'``; every other model type goes through
    ``train_detector``. All arguments are forwarded unchanged.
    """
    is_segmentor = cfg.model.type in ['EncoderDecoder3D']
    launch = train_segmentor if is_segmentor else train_detector
    launch(
        model,
        dataset,
        cfg,
        distributed=distributed,
        validate=validate,
        timestamp=timestamp,
        meta=meta)
from .bevformer_head import BEVFormerHead, BEVFormerHead_GroupDETR
from .bev_head import BEVHead
import copy
from re import I
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Linear, bias_init_with_prob
from mmcv.utils import TORCH_VERSION, digit_version
from mmdet.core import (multi_apply, multi_apply, reduce_mean)
from mmdet.models.utils.transformer import inverse_sigmoid
from mmdet.models import HEADS
from mmdet.models.dense_heads import DETRHead
from mmdet3d.core.bbox.coders import build_bbox_coder
from traitlets import import_item
from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox
from mmcv.cnn.bricks.transformer import build_positional_encoding
from mmcv.runner import BaseModule, force_fp32
from projects.mmdet3d_plugin.models.utils.bricks import run_time
import numpy as np
import mmcv
import cv2 as cv
from projects.mmdet3d_plugin.bevformer.modules import PerceptionTransformerBEVEncoder
from mmdet.models.utils import build_transformer
from mmdet3d.models.builder import build_head
from mmdet3d.models.dense_heads.free_anchor3d_head import FreeAnchor3DHead
@HEADS.register_module()
class BEVHead(BaseModule):
    """Head that encodes multi-view features into BEV and runs a 3D bbox head.

    The BEV encoder built from ``transformer`` produces a BEV embedding,
    which is reshaped into a ``(bs, embed_dims, bev_h, bev_w)`` feature map
    and passed to the head built from ``pts_bbox_head_3d``.

    Args:
        bev_h (int): Number of BEV grid rows.
        bev_w (int): Number of BEV grid columns.
        pc_range (Sequence[float]): Range whose entries 0/3 and 1/4 give
            the extent used for ``real_w`` and ``real_h`` respectively.
        embed_dims (int): Channel size of the BEV query embedding.
        transformer (dict): Config passed to ``build_transformer``.
        positional_encoding (dict): Config passed to
            ``build_positional_encoding``.
        pts_bbox_head_3d (dict): Config passed to ``build_head``. It is
            updated in place with ``kwargs`` before building.
        init_cfg (dict, optional): Forwarded to ``BaseModule``.
        **kwargs: Extra entries merged into ``pts_bbox_head_3d``.
    """

    def __init__(self,
                 bev_h,
                 bev_w,
                 pc_range,
                 embed_dims,
                 transformer,
                 positional_encoding: dict,
                 pts_bbox_head_3d: dict,
                 init_cfg=None,
                 **kwargs,
                 ):
        super(BEVHead, self).__init__(init_cfg=init_cfg)
        self.bev_h = bev_h
        self.bev_w = bev_w
        self.embed_dims = embed_dims
        self.pc_range = pc_range
        self.fp16_enabled = False

        self.transformer: PerceptionTransformerBEVEncoder = build_transformer(transformer)
        self.positional_encoding = build_positional_encoding(positional_encoding)

        # NOTE: this mutates the dict passed in by the caller.
        pts_bbox_head_3d.update(kwargs)
        self.pts_bbox_head_3d = build_head(pts_bbox_head_3d)

        self.real_w = self.pc_range[3] - self.pc_range[0]
        self.real_h = self.pc_range[4] - self.pc_range[1]

        self._init_layers()

    def init_weights(self):
        """Initialize weights of the Multi View BEV Encoder"""
        self.transformer.init_weights()

    def _init_layers(self):
        """Create the learnable BEV query embedding (one row per BEV cell)."""
        self.bev_embedding = nn.Embedding(self.bev_h * self.bev_w, self.embed_dims)

    # 'prev_bev' was previously misspelled as 'pred_bev', which matches no
    # argument of ``forward`` and therefore never selected prev_bev.
    @force_fp32(apply_to=('mlvl_feats', 'prev_bev'))
    def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False):
        """Encode image features into BEV and optionally run the bbox head.

        Args:
            mlvl_feats (Sequence[Tensor]): Multi-level features; the first
                level must be 5-D with the batch size as leading dimension.
            img_metas: Meta information forwarded to the transformer.
            prev_bev: Previous BEV features forwarded to the transformer.
            only_bev (bool): If True, return the BEV embedding directly.

        Returns:
            The BEV embedding when ``only_bev`` is True. Otherwise a dict
            with key ``'pred'`` (output of the wrapped bbox head) and, when
            not in training mode, ``'bev_embed'``.
        """
        # Unpacking also checks that the first level is 5-D.
        bs, _, _, _, _ = mlvl_feats[0].shape
        dtype = mlvl_feats[0].dtype
        bev_queries = self.bev_embedding.weight.to(dtype)

        bev_mask = torch.zeros((bs, self.bev_h, self.bev_w),
                               device=bev_queries.device).to(dtype)
        bev_pos = self.positional_encoding(bev_mask).to(dtype)

        bev_embed = self.transformer(
            mlvl_feats,
            bev_queries,
            self.bev_h,
            self.bev_w,
            grid_length=(self.real_h / self.bev_h,
                         self.real_w / self.bev_w),
            bev_pos=bev_pos,
            img_metas=img_metas,
            prev_bev=prev_bev,
        )

        if only_bev:
            return bev_embed

        # Move channels ahead of the flattened grid, then unflatten the
        # grid so the bbox head receives a 2-D feature map.
        bev_feature = bev_embed.permute(0, 2, 1).reshape(
            bs, self.embed_dims, self.bev_h, self.bev_w)
        ret = {}
        ret['pred'] = self.pts_bbox_head_3d([bev_feature, ])
        if not self.training:
            ret['bev_embed'] = bev_embed
        return ret

    @force_fp32(apply_to=('ret', ))
    def loss(self,
             gt_bboxes_list,
             gt_labels_list,
             ret,
             gt_bboxes_ignore=None,
             img_metas=None):
        """Compute losses by delegating to the wrapped bbox head.

        Args:
            gt_bboxes_list: Ground-truth boxes, forwarded unchanged.
            gt_labels_list: Ground-truth labels, forwarded unchanged.
            ret (dict): Output of :meth:`forward`; only ``ret['pred']`` is
                used.
            gt_bboxes_ignore: Must be ``None``.
            img_metas: Meta information, forwarded unchanged.

        Returns:
            Whatever ``pts_bbox_head_3d.loss`` returns.
        """
        assert gt_bboxes_ignore is None
        return self.pts_bbox_head_3d.loss(
            gt_bboxes_list,
            gt_labels_list,
            ret['pred'],
            gt_bboxes_ignore=gt_bboxes_ignore,
            img_metas=img_metas)

    @force_fp32(apply_to=('ret', ))
    def get_bboxes(self, ret, img_metas, rescale=False):
        """Decode boxes by delegating to the wrapped bbox head.

        Args:
            ret (dict): Output of :meth:`forward`; only ``ret['pred']`` is
                used.
            img_metas: Meta information, forwarded unchanged.
            rescale (bool): Forwarded to the wrapped head.

        Returns:
            Whatever ``pts_bbox_head_3d.get_bboxes`` returns.
        """
        # ``rescale`` used to be accepted here but silently dropped.
        return self.pts_bbox_head_3d.get_bboxes(
            ret['pred'], img_metas, rescale=rescale)
@HEADS.register_module()
class FreeAnchor3DHeadV2(FreeAnchor3DHead):
    """``FreeAnchor3DHead`` variant taking its predictions as one argument.

    ``loss`` and ``get_bboxes`` accept the head output as a single 3-tuple
    ``pred = (cls_scores, bbox_preds, dir_cls_preds)`` and unpack it before
    calling the parent implementation.
    """

    # ``apply_to`` must be a tuple of names; ``('pred')`` is just the string
    # 'pred', which is matched by substring instead of by exact name.
    @force_fp32(apply_to=('pred', ))
    def loss(self,
             gt_bboxes_list,
             gt_labels_list,
             pred,
             gt_bboxes_ignore=None,
             img_metas=None):
        """Unpack ``pred`` and compute the parent head's loss.

        Args:
            gt_bboxes_list: Ground-truth boxes, forwarded unchanged.
            gt_labels_list: Ground-truth labels, forwarded unchanged.
            pred: 3-tuple ``(cls_scores, bbox_preds, dir_cls_preds)``.
            gt_bboxes_ignore: Forwarded unchanged.
            img_metas: Meta information, forwarded unchanged.

        Returns:
            Whatever ``FreeAnchor3DHead.loss`` returns.
        """
        cls_scores, bbox_preds, dir_cls_preds = pred
        return super().loss(cls_scores, bbox_preds, dir_cls_preds,
                            gt_bboxes_list, gt_labels_list, img_metas,
                            gt_bboxes_ignore)

    @force_fp32(apply_to=('pred', ))
    def get_bboxes(self, pred, img_metas, rescale=False):
        """Unpack ``pred`` and decode boxes with the parent head.

        Args:
            pred: 3-tuple ``(cls_scores, bbox_preds, dir_cls_preds)``.
            img_metas: Meta information, forwarded unchanged.
            rescale (bool): Forwarded unchanged.

        Returns:
            Whatever ``FreeAnchor3DHead.get_bboxes`` returns.
        """
        cls_scores, bbox_preds, dir_cls_preds = pred
        return super().get_bboxes(
            cls_scores,
            bbox_preds,
            dir_cls_preds,
            img_metas,
            cfg=None,
            rescale=rescale)
\ No newline at end of file
import copy
import torch
import torch.nn as nn
from mmcv.cnn import Linear, bias_init_with_prob
from mmcv.utils import TORCH_VERSION, digit_version
from mmdet.core import (multi_apply, multi_apply, reduce_mean)
from mmdet.models.utils.transformer import inverse_sigmoid
from mmdet.models import HEADS
from mmdet.models.dense_heads import DETRHead
from mmdet3d.core.bbox.coders import build_bbox_coder
from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox
from mmcv.runner import force_fp32, auto_fp16
@HEADS.register_module()
class BEVFormerHead(DETRHead):
    """Head of Detr3D.

    Args:
        with_box_refine (bool): Whether to refine the reference points
            in the decoder. Defaults to False.
        as_two_stage (bool) : Whether to generate the proposal from
            the outputs of encoder.
        transformer (obj:`ConfigDict`): ConfigDict is used for building
            the Encoder and Decoder.
        bbox_coder (dict): Config passed to ``build_bbox_coder``; the built
            coder must expose ``pc_range``.
        num_cls_fcs (int): Stored (minus one) as ``self.num_cls_fcs``.
        code_weights (list[float], optional): Per-dimension weights applied
            to the bbox regression weights. Defaults to ten weights, the
            last two being 0.2.
        bev_h, bev_w (int): spatial shape of BEV queries.
    """

    def __init__(self,
                 *args,
                 with_box_refine=False,
                 as_two_stage=False,
                 transformer=None,
                 bbox_coder=None,
                 num_cls_fcs=2,
                 code_weights=None,
                 bev_h=30,
                 bev_w=30,
                 **kwargs):
        # These attributes are assigned before ``super().__init__`` because
        # ``_init_layers`` reads them (presumably it is invoked from the
        # parent constructor -- confirm against DETRHead).
        self.bev_h = bev_h
        self.bev_w = bev_w
        self.fp16_enabled = False

        self.with_box_refine = with_box_refine
        self.as_two_stage = as_two_stage
        if self.as_two_stage:
            transformer['as_two_stage'] = self.as_two_stage
        if 'code_size' in kwargs:
            self.code_size = kwargs['code_size']
        else:
            self.code_size = 10
        if code_weights is not None:
            self.code_weights = code_weights
        else:
            self.code_weights = [1.0, 1.0, 1.0,
                                 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]

        self.bbox_coder = build_bbox_coder(bbox_coder)
        self.pc_range = self.bbox_coder.pc_range
        self.real_w = self.pc_range[3] - self.pc_range[0]
        self.real_h = self.pc_range[4] - self.pc_range[1]
        self.num_cls_fcs = num_cls_fcs - 1
        super(BEVFormerHead, self).__init__(
            *args, transformer=transformer, **kwargs)
        # Stored as a frozen parameter so it follows the module's device.
        self.code_weights = nn.Parameter(torch.tensor(
            self.code_weights, requires_grad=False), requires_grad=False)

    def _init_layers(self):
        """Initialize classification branch and regression branch of head."""
        cls_branch = []
        for _ in range(self.num_reg_fcs):
            cls_branch.append(Linear(self.embed_dims, self.embed_dims))
            cls_branch.append(nn.LayerNorm(self.embed_dims))
            cls_branch.append(nn.ReLU(inplace=True))
        cls_branch.append(Linear(self.embed_dims, self.cls_out_channels))
        fc_cls = nn.Sequential(*cls_branch)

        reg_branch = []
        for _ in range(self.num_reg_fcs):
            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
            reg_branch.append(nn.ReLU())
        reg_branch.append(Linear(self.embed_dims, self.code_size))
        reg_branch = nn.Sequential(*reg_branch)

        def _get_clones(module, N):
            return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

        # last reg_branch is used to generate proposal from
        # encode feature map when as_two_stage is True.
        num_pred = (self.transformer.decoder.num_layers + 1) if \
            self.as_two_stage else self.transformer.decoder.num_layers

        if self.with_box_refine:
            # Independent copies per prediction layer.
            self.cls_branches = _get_clones(fc_cls, num_pred)
            self.reg_branches = _get_clones(reg_branch, num_pred)
        else:
            # The same module instance is shared by every prediction layer.
            self.cls_branches = nn.ModuleList(
                [fc_cls for _ in range(num_pred)])
            self.reg_branches = nn.ModuleList(
                [reg_branch for _ in range(num_pred)])

        if not self.as_two_stage:
            self.bev_embedding = nn.Embedding(
                self.bev_h * self.bev_w, self.embed_dims)
            self.query_embedding = nn.Embedding(self.num_query,
                                                self.embed_dims * 2)

    def init_weights(self):
        """Initialize weights of the DeformDETR head."""
        self.transformer.init_weights()
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            for m in self.cls_branches:
                nn.init.constant_(m[-1].bias, bias_init)

    # ``apply_to`` must be a tuple of names; ``('mlvl_feats')`` is a plain
    # string and would be matched by substring.
    @auto_fp16(apply_to=('mlvl_feats', ))
    def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False):
        """Forward function.

        Args:
            mlvl_feats (tuple[Tensor]): Features from the upstream
                network, each is a 5D-tensor with shape
                (B, N, C, H, W).
            img_metas: Meta information forwarded to the transformer.
            prev_bev: previous bev featues
            only_bev: only compute BEV features with encoder.

        Returns:
            When ``only_bev`` is True, the output of
            ``transformer.get_bev_features``. Otherwise a dict with keys:

            - ``bev_embed``: BEV embedding returned by the transformer.
            - ``all_cls_scores`` (Tensor): Classification outputs of every
              decoder layer, stacked along a new leading dimension.
            - ``all_bbox_preds`` (Tensor): Regression outputs of every
              decoder layer, stacked the same way. Channels 0, 1 and 4 are
              sigmoid-normalised and rescaled to ``pc_range``.
            - ``enc_cls_scores`` / ``enc_bbox_preds``: always ``None``.
        """
        # Unpacking also checks that the first level is 5-D.
        bs, _, _, _, _ = mlvl_feats[0].shape
        dtype = mlvl_feats[0].dtype
        object_query_embeds = self.query_embedding.weight.to(dtype)
        bev_queries = self.bev_embedding.weight.to(dtype)

        bev_mask = torch.zeros((bs, self.bev_h, self.bev_w),
                               device=bev_queries.device).to(dtype)
        bev_pos = self.positional_encoding(bev_mask).to(dtype)

        if only_bev:  # only use encoder to obtain BEV features, TODO: refine the workaround
            return self.transformer.get_bev_features(
                mlvl_feats,
                bev_queries,
                self.bev_h,
                self.bev_w,
                grid_length=(self.real_h / self.bev_h,
                             self.real_w / self.bev_w),
                bev_pos=bev_pos,
                img_metas=img_metas,
                prev_bev=prev_bev,
            )
        else:
            outputs = self.transformer(
                mlvl_feats,
                bev_queries,
                object_query_embeds,
                self.bev_h,
                self.bev_w,
                grid_length=(self.real_h / self.bev_h,
                             self.real_w / self.bev_w),
                bev_pos=bev_pos,
                reg_branches=self.reg_branches if self.with_box_refine else None,  # noqa:E501
                cls_branches=self.cls_branches if self.as_two_stage else None,
                img_metas=img_metas,
                prev_bev=prev_bev
            )

        bev_embed, hs, init_reference, inter_references = outputs
        hs = hs.permute(0, 2, 1, 3)
        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.cls_branches[lvl](hs[lvl])
            tmp = self.reg_branches[lvl](hs[lvl])

            # TODO: check the shape of reference
            assert reference.shape[-1] == 3
            # Channels 0:2 and 4 are offsets relative to the reference
            # point (in logit space); they are added, squashed to [0, 1]
            # and then scaled to the point cloud range.
            tmp[..., 0:2] += reference[..., 0:2]
            tmp[..., 0:2] = tmp[..., 0:2].sigmoid()
            tmp[..., 4:5] += reference[..., 2:3]
            tmp[..., 4:5] = tmp[..., 4:5].sigmoid()
            tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] -
                             self.pc_range[0]) + self.pc_range[0])
            tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] -
                             self.pc_range[1]) + self.pc_range[1])
            tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] -
                             self.pc_range[2]) + self.pc_range[2])

            # TODO: check if using sigmoid
            outputs_coord = tmp
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)

        outputs_classes = torch.stack(outputs_classes)
        outputs_coords = torch.stack(outputs_coords)

        outs = {
            'bev_embed': bev_embed,
            'all_cls_scores': outputs_classes,
            'all_bbox_preds': outputs_coords,
            'enc_cls_scores': None,
            'enc_bbox_preds': None,
        }

        return outs

    def _get_target_single(self,
                           cls_score,
                           bbox_pred,
                           gt_labels,
                           gt_bboxes,
                           gt_bboxes_ignore=None):
        """Compute regression and classification targets for one sample.

        Outputs from a single decoder layer are used.

        Args:
            cls_score (Tensor): Box score logits from a single decoder layer
                for one sample; the first dimension indexes queries.
            bbox_pred (Tensor): Box predictions from a single decoder layer
                for one sample; the first dimension indexes queries.
            gt_labels (Tensor): Ground truth class indices for one sample.
            gt_bboxes (Tensor): Ground truth boxes for one sample; the last
                dimension is the box code.
            gt_bboxes_ignore (Tensor, optional): Forwarded to the assigner.
                Default None.

        Returns:
            tuple[Tensor]: a tuple containing the following for one sample.

                - labels (Tensor): Per-query labels; ``self.num_classes``
                  marks queries without an assigned ground truth.
                - label_weights (Tensor): Per-query label weights (all one).
                - bbox_targets (Tensor): Per-query box targets, with as many
                  columns as ``gt_bboxes``.
                - bbox_weights (Tensor): Per-query box weights, one for
                  positive queries and zero otherwise.
                - pos_inds (Tensor): Sampled positive indices.
                - neg_inds (Tensor): Sampled negative indices.
        """
        num_bboxes = bbox_pred.size(0)
        # assigner and sampler
        gt_c = gt_bboxes.shape[-1]

        assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
                                             gt_labels, gt_bboxes_ignore)

        sampling_result = self.sampler.sample(assign_result, bbox_pred,
                                              gt_bboxes)
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds

        # label targets
        labels = gt_bboxes.new_full((num_bboxes,),
                                    self.num_classes,
                                    dtype=torch.long)
        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
        label_weights = gt_bboxes.new_ones(num_bboxes)

        # bbox targets
        bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c]
        bbox_weights = torch.zeros_like(bbox_pred)
        bbox_weights[pos_inds] = 1.0

        # DETR
        bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
        return (labels, label_weights, bbox_targets, bbox_weights,
                pos_inds, neg_inds)

    def get_targets(self,
                    cls_scores_list,
                    bbox_preds_list,
                    gt_bboxes_list,
                    gt_labels_list,
                    gt_bboxes_ignore_list=None):
        """Compute regression and classification targets for a batch.

        Outputs from a single decoder layer are used.

        Args:
            cls_scores_list (list[Tensor]): Box score logits from a single
                decoder layer, one entry per sample.
            bbox_preds_list (list[Tensor]): Box predictions from a single
                decoder layer, one entry per sample.
            gt_bboxes_list (list[Tensor]): Ground truth boxes, one entry per
                sample.
            gt_labels_list (list[Tensor]): Ground truth class indices, one
                entry per sample.
            gt_bboxes_ignore_list (optional): Must be None.

        Returns:
            tuple: a tuple containing the following targets.

                - labels_list (list[Tensor]): Labels for all samples.
                - label_weights_list (list[Tensor]): Label weights for all
                  samples.
                - bbox_targets_list (list[Tensor]): BBox targets for all
                  samples.
                - bbox_weights_list (list[Tensor]): BBox weights for all
                  samples.
                - num_total_pos (int): Number of positive queries in all
                  samples.
                - num_total_neg (int): Number of negative queries in all
                  samples.
        """
        assert gt_bboxes_ignore_list is None, \
            'Only supports for gt_bboxes_ignore setting to None.'
        num_imgs = len(cls_scores_list)
        gt_bboxes_ignore_list = [
            gt_bboxes_ignore_list for _ in range(num_imgs)
        ]

        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
            self._get_target_single, cls_scores_list, bbox_preds_list,
            gt_labels_list, gt_bboxes_list, gt_bboxes_ignore_list)
        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos, num_total_neg)

    def loss_single(self,
                    cls_scores,
                    bbox_preds,
                    gt_bboxes_list,
                    gt_labels_list,
                    gt_bboxes_ignore_list=None):
        """Loss function for outputs from a single decoder layer.

        Args:
            cls_scores (Tensor): Box score logits from a single decoder layer
                for all samples; the first dimension is the batch.
            bbox_preds (Tensor): Box predictions from a single decoder layer
                for all samples; the first dimension is the batch.
            gt_bboxes_list (list[Tensor]): Ground truth boxes, one entry per
                sample.
            gt_labels_list (list[Tensor]): Ground truth class indices, one
                entry per sample.
            gt_bboxes_ignore_list (optional): Must be None (checked in
                :meth:`get_targets`).

        Returns:
            tuple[Tensor, Tensor]: ``(loss_cls, loss_bbox)`` for this
            decoder layer.
        """
        num_imgs = cls_scores.size(0)
        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
                                           gt_bboxes_list, gt_labels_list,
                                           gt_bboxes_ignore_list)
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         num_total_pos, num_total_neg) = cls_reg_targets
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                cls_scores.new_tensor([cls_avg_factor]))

        cls_avg_factor = max(cls_avg_factor, 1)
        loss_cls = self.loss_cls(
            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes accross all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # regression L1 loss
        bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1))
        normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range)
        # Rows whose normalised target contains NaN/Inf are excluded.
        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
        bbox_weights = bbox_weights * self.code_weights

        loss_bbox = self.loss_bbox(
            bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan,
                                                               :10], bbox_weights[isnotnan, :10],
            avg_factor=num_total_pos)
        if digit_version(TORCH_VERSION) >= digit_version('1.8'):
            loss_cls = torch.nan_to_num(loss_cls)
            loss_bbox = torch.nan_to_num(loss_bbox)
        return loss_cls, loss_bbox

    @force_fp32(apply_to=('preds_dicts', ))
    def loss(self,
             gt_bboxes_list,
             gt_labels_list,
             preds_dicts,
             gt_bboxes_ignore=None,
             img_metas=None):
        """Loss function.

        Args:
            gt_bboxes_list (list): Ground truth 3D boxes, one entry per
                sample. Each entry must expose ``gravity_center`` and
                ``tensor``.
            gt_labels_list (list[Tensor]): Ground truth class indices, one
                entry per sample.
            preds_dicts (dict): Output of :meth:`forward` with keys

                - ``all_cls_scores`` (Tensor): Classification scores of all
                  decoder layers; the first dimension indexes layers.
                - ``all_bbox_preds`` (Tensor): Regression outputs of all
                  decoder layers; the first dimension indexes layers.
                - ``enc_cls_scores`` / ``enc_bbox_preds``: Encoder proposal
                  outputs, or None.
            gt_bboxes_ignore (optional): Must be None.
            img_metas: Unused here.

        Returns:
            dict[str, Tensor]: A dictionary of loss components. The last
            decoder layer is stored under ``loss_cls`` / ``loss_bbox`` and
            earlier layers under ``d{i}.loss_cls`` / ``d{i}.loss_bbox``.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'for gt_bboxes_ignore setting to None.'

        all_cls_scores = preds_dicts['all_cls_scores']
        all_bbox_preds = preds_dicts['all_bbox_preds']
        enc_cls_scores = preds_dicts['enc_cls_scores']
        enc_bbox_preds = preds_dicts['enc_bbox_preds']

        num_dec_layers = len(all_cls_scores)
        device = gt_labels_list[0].device

        # Replace the first three box columns with the gravity centre and
        # keep the remaining columns as they are.
        gt_bboxes_list = [torch.cat(
            (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]),
            dim=1).to(device) for gt_bboxes in gt_bboxes_list]

        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]

        losses_cls, losses_bbox = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list,
            all_gt_bboxes_ignore_list)

        loss_dict = dict()
        # loss of proposal generated from encode feature map.
        if enc_cls_scores is not None:
            # One all-zero label tensor per sample. This used to iterate
            # over the number of decoder layers, which produced the wrong
            # number of entries (or an IndexError) whenever the batch size
            # differed from the layer count.
            binary_labels_list = [
                torch.zeros_like(gt_labels) for gt_labels in gt_labels_list
            ]
            enc_loss_cls, enc_losses_bbox = \
                self.loss_single(enc_cls_scores, enc_bbox_preds,
                                 gt_bboxes_list, binary_labels_list, gt_bboxes_ignore)
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_bbox'] = enc_losses_bbox

        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]

        # loss from other decoder layers
        for num_dec_layer, (loss_cls_i, loss_bbox_i) in enumerate(
                zip(losses_cls[:-1], losses_bbox[:-1])):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i

        return loss_dict

    @force_fp32(apply_to=('preds_dicts', ))
    def get_bboxes(self, preds_dicts, img_metas, rescale=False):
        """Generate bboxes from bbox head predictions.

        Args:
            preds_dicts: Prediction results, decoded with
                ``self.bbox_coder.decode``.
            img_metas (list[dict]): Meta info per sample; each must provide
                the box class under ``'box_type_3d'``.
            rescale (bool): Unused here.

        Returns:
            list[list]: One ``[bboxes, scores, labels]`` entry per sample.
        """
        preds_dicts = self.bbox_coder.decode(preds_dicts)

        num_samples = len(preds_dicts)
        ret_list = []
        for i in range(num_samples):
            preds = preds_dicts[i]
            bboxes = preds['bboxes']

            # Lower column 2 by half of column 5 -- presumably converting a
            # gravity-centre z to a bottom-centre z (TODO confirm).
            bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5

            code_size = bboxes.shape[-1]
            bboxes = img_metas[i]['box_type_3d'](bboxes, code_size)
            scores = preds['scores']
            labels = preds['labels']

            ret_list.append([bboxes, scores, labels])

        return ret_list
@HEADS.register_module()
class BEVFormerHead_GroupDETR(BEVFormerHead):
    """``BEVFormerHead`` with ``group_detr`` groups of object queries.

    The ``num_query`` keyword is multiplied by ``group_detr`` before it is
    handed to the parent constructor. Outside training mode only the first
    group of queries is used; the loss is averaged over the groups.

    Args:
        group_detr (int): Number of query groups. Defaults to 1.
        *args, **kwargs: Forwarded to ``BEVFormerHead``; ``num_query`` must
            be given as a keyword.
    """

    def __init__(self,
                 *args,
                 group_detr=1,
                 **kwargs):
        self.group_detr = group_detr
        assert 'num_query' in kwargs
        queries_per_group = kwargs['num_query']
        kwargs['num_query'] = group_detr * queries_per_group
        super().__init__(*args, **kwargs)

    def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False):
        """Forward pass; same arguments as ``BEVFormerHead.forward``.

        Returns:
            The output of ``transformer.get_bev_features`` when ``only_bev``
            is True, otherwise a dict with keys ``bev_embed``,
            ``all_cls_scores``, ``all_bbox_preds``, ``enc_cls_scores`` and
            ``enc_bbox_preds`` (the last two always ``None``).
        """
        batch_size, _, _, _, _ = mlvl_feats[0].shape
        feat_dtype = mlvl_feats[0].dtype

        object_query_embeds = self.query_embedding.weight.to(feat_dtype)
        if not self.training:  # NOTE: Only difference to bevformer head
            # Keep only the first query group outside training mode.
            first_group_len = self.num_query // self.group_detr
            object_query_embeds = object_query_embeds[:first_group_len]

        bev_queries = self.bev_embedding.weight.to(feat_dtype)
        empty_mask = torch.zeros((batch_size, self.bev_h, self.bev_w),
                                 device=bev_queries.device).to(feat_dtype)
        bev_pos = self.positional_encoding(empty_mask).to(feat_dtype)
        cell_size = (self.real_h / self.bev_h, self.real_w / self.bev_w)

        if only_bev:
            return self.transformer.get_bev_features(
                mlvl_feats,
                bev_queries,
                self.bev_h,
                self.bev_w,
                grid_length=cell_size,
                bev_pos=bev_pos,
                img_metas=img_metas,
                prev_bev=prev_bev,
            )

        refine_branches = self.reg_branches if self.with_box_refine else None
        proposal_branches = self.cls_branches if self.as_two_stage else None
        bev_embed, hs, init_reference, inter_references = self.transformer(
            mlvl_feats,
            bev_queries,
            object_query_embeds,
            self.bev_h,
            self.bev_w,
            grid_length=cell_size,
            bev_pos=bev_pos,
            reg_branches=refine_branches,
            cls_branches=proposal_branches,
            img_metas=img_metas,
            prev_bev=prev_bev
        )

        hs = hs.permute(0, 2, 1, 3)
        pc_range = self.pc_range
        # (output channel, index of range minimum, index of range maximum)
        rescaled_channels = ((0, 0, 3), (1, 1, 4), (4, 2, 5))

        layer_scores = []
        layer_boxes = []
        for lvl in range(hs.shape[0]):
            reference = init_reference if lvl == 0 else inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            layer_scores.append(self.cls_branches[lvl](hs[lvl]))
            box = self.reg_branches[lvl](hs[lvl])

            assert reference.shape[-1] == 3
            # Offsets are added to the reference point in logit space and
            # squashed back to [0, 1] ...
            box[..., 0:2] += reference[..., 0:2]
            box[..., 0:2] = box[..., 0:2].sigmoid()
            box[..., 4:5] += reference[..., 2:3]
            box[..., 4:5] = box[..., 4:5].sigmoid()
            # ... then scaled to the point cloud range.
            for ch, lo, hi in rescaled_channels:
                box[..., ch:ch + 1] = (box[..., ch:ch + 1] *
                                       (pc_range[hi] - pc_range[lo]) +
                                       pc_range[lo])
            layer_boxes.append(box)

        return {
            'bev_embed': bev_embed,
            'all_cls_scores': torch.stack(layer_scores),
            'all_bbox_preds': torch.stack(layer_boxes),
            'enc_cls_scores': None,
            'enc_bbox_preds': None,
        }

    def loss(self,
             gt_bboxes_list,
             gt_labels_list,
             preds_dicts,
             gt_bboxes_ignore=None,
             img_metas=None):
        """Loss averaged over the query groups.

        The queries (third dimension of the predictions) are split into
        ``group_detr`` equally sized consecutive groups. ``loss_single`` is
        applied to each group for every decoder layer and the per-group
        losses are divided by ``group_detr`` and summed.

        Args:
            gt_bboxes_list (list): Ground truth 3D boxes, one entry per
                sample. Each entry must expose ``gravity_center`` and
                ``tensor``.
            gt_labels_list (list[Tensor]): Ground truth class indices, one
                entry per sample.
            preds_dicts (dict): Must contain ``all_cls_scores`` and
                ``all_bbox_preds`` (first dimension indexes decoder layers)
                as well as ``enc_cls_scores`` and ``enc_bbox_preds``, both
                of which must be None.
            gt_bboxes_ignore (optional): Must be None.
            img_metas: Unused here.

        Returns:
            dict[str, Tensor]: ``loss_cls`` / ``loss_bbox`` for the last
            decoder layer and ``d{i}.loss_cls`` / ``d{i}.loss_bbox`` for the
            earlier ones.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'for gt_bboxes_ignore setting to None.'

        all_cls_scores = preds_dicts['all_cls_scores']
        all_bbox_preds = preds_dicts['all_bbox_preds']
        enc_cls_scores = preds_dicts['enc_cls_scores']
        enc_bbox_preds = preds_dicts['enc_bbox_preds']
        assert enc_cls_scores is None and enc_bbox_preds is None

        layer_count = len(all_cls_scores)
        device = gt_labels_list[0].device

        # Replace the first three box columns with the gravity centre and
        # keep the remaining columns as they are.
        gt_bboxes_list = [
            torch.cat((boxes.gravity_center, boxes.tensor[:, 3:]),
                      dim=1).to(device)
            for boxes in gt_bboxes_list
        ]

        # The same ground truth is reused for every decoder layer.
        per_layer_bboxes = [gt_bboxes_list] * layer_count
        per_layer_labels = [gt_labels_list] * layer_count
        per_layer_ignore = [gt_bboxes_ignore] * layer_count

        # Accumulators; insertion order is final layer first, then d0, d1...
        totals = {'loss_cls': 0, 'loss_bbox': 0}
        for layer_idx in range(all_cls_scores.shape[0] - 1):
            totals[f'd{layer_idx}.loss_cls'] = 0
            totals[f'd{layer_idx}.loss_bbox'] = 0

        group_len = self.num_query // self.group_detr
        for group_idx in range(self.group_detr):
            queries = slice(group_idx * group_len,
                            (group_idx + 1) * group_len)
            losses_cls, losses_bbox = multi_apply(
                self.loss_single,
                all_cls_scores[:, :, queries, :],
                all_bbox_preds[:, :, queries, :],
                per_layer_bboxes, per_layer_labels, per_layer_ignore)

            # loss from the last decoder layer
            totals['loss_cls'] += losses_cls[-1] / self.group_detr
            totals['loss_bbox'] += losses_bbox[-1] / self.group_detr

            # loss from other decoder layers
            earlier_layers = zip(losses_cls[:-1], losses_bbox[:-1])
            for layer_idx, (cls_i, bbox_i) in enumerate(earlier_layers):
                totals[f'd{layer_idx}.loss_cls'] += cls_i / self.group_detr
                totals[f'd{layer_idx}.loss_bbox'] += bbox_i / self.group_detr

        return totals
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment