# PETR with a VoVNet-99 image backbone on nuScenes (multi-view, camera-only
# 3D detection). Inherits dataset, runtime and cyclic-20e schedule defaults
# from the base configs below.
_base_ = [
    '../../../configs/_base_/datasets/nus-3d.py',
    '../../../configs/_base_/default_runtime.py',
    '../../../configs/_base_/schedules/cyclic-20e.py'
]
backbone_norm_cfg = dict(type='LN', requires_grad=True)
custom_imports = dict(imports=['projects.PETR.petr'])
randomness = dict(seed=1, deterministic=False, diff_rank_seed=False)

# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]
# NOTE(review): img_norm_cfg is not referenced anywhere in this file —
# normalization is performed by the data_preprocessor below. Kept for
# reference; confirm before removing.
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675],
    std=[57.375, 57.120, 58.395],
    to_rgb=False)
# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
metainfo = dict(classes=class_names)
input_modality = dict(use_camera=True)

model = dict(
    type='PETR',
    data_preprocessor=dict(
        type='Det3DDataPreprocessor',
        mean=[103.530, 116.280, 123.675],
        std=[57.375, 57.120, 58.395],
        bgr_to_rgb=False,
        pad_size_divisor=32),
    use_grid_mask=True,
    img_backbone=dict(
        type='VoVNetCP',
        spec_name='V-99-eSE',
        norm_eval=True,
        frozen_stages=-1,
        input_ch=3,
        out_features=(
            'stage4',
            'stage5',
        )),
    img_neck=dict(
        type='CPFPN', in_channels=[768, 1024], out_channels=256, num_outs=2),
    pts_bbox_head=dict(
        type='PETRHead',
        num_classes=10,
        in_channels=256,
        num_query=900,
        LID=True,
        with_position=True,
        with_multiview=True,
        position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
        normedlinear=False,
        transformer=dict(
            type='PETRTransformer',
            decoder=dict(
                type='PETRTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='PETRTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            attn_drop=0.1,
                            dropout_layer=dict(type='Dropout', drop_prob=0.1)),
                        dict(
                            type='PETRMultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            attn_drop=0.1,
                            dropout_layer=dict(type='Dropout', drop_prob=0.1)),
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')),
            )),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='SinePositionalEncoding3D', num_feats=128, normalize=True),
        loss_cls=dict(
            type='mmdet.FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25),
        loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                # Fake cost. Just to be compatible with DETR head.
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range))))

dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
backend_args = None

# NOTE(review): db_sampler is not referenced by any pipeline in this file
# (GT-sampling augmentation is unused here); confirm before removing.
db_sampler = dict(
    data_root=data_root,
    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
    rate=1.0,
    prepare=dict(
        filter_by_difficulty=[-1],
        filter_by_min_points=dict(
            car=5,
            truck=5,
            bus=5,
            trailer=5,
            construction_vehicle=5,
            traffic_cone=5,
            barrier=5,
            motorcycle=5,
            bicycle=5,
            pedestrian=5)),
    classes=class_names,
    sample_groups=dict(
        car=2,
        truck=3,
        construction_vehicle=7,
        bus=4,
        trailer=6,
        barrier=2,
        motorcycle=6,
        bicycle=6,
        pedestrian=2,
        traffic_cone=2),
    points_loader=dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=[0, 1, 2, 3, 4],
        backend_args=backend_args),
    backend_args=backend_args)

# Image-domain augmentation parameters consumed by ResizeCropFlipImage.
ida_aug_conf = {
    'resize_lim': (0.47, 0.625),
    'final_dim': (320, 800),
    'bot_pct_lim': (0.0, 0.0),
    'rot_lim': (0.0, 0.0),
    'H': 900,
    'W': 1600,
    'rand_flip': True,
}

train_pipeline = [
    dict(
        type='LoadMultiViewImageFromFiles',
        to_float32=True,
        backend_args=backend_args),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(
        type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf, training=True),
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-0.3925, 0.3925],
        translation_std=[0, 0, 0],
        scale_ratio_range=[0.95, 1.05],
        reverse_angle=False,
        training=True),
    dict(
        type='Pack3DDetInputs',
        keys=[
            'img', 'gt_bboxes', 'gt_bboxes_labels', 'attr_labels',
            'gt_bboxes_3d', 'gt_labels_3d', 'centers_2d', 'depths'
        ])
]
test_pipeline = [
    dict(
        type='LoadMultiViewImageFromFiles',
        to_float32=True,
        backend_args=backend_args),
    dict(
        type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf,
        training=False),
    dict(type='Pack3DDetInputs', keys=['img'])
]

train_dataloader = dict(
    batch_size=1,
    num_workers=4,
    dataset=dict(
        type=dataset_type,
        data_prefix=dict(
            pts='samples/LIDAR_TOP',
            CAM_FRONT='samples/CAM_FRONT',
            CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
            CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
            CAM_BACK='samples/CAM_BACK',
            CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
            CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
        pipeline=train_pipeline,
        box_type_3d='LiDAR',
        metainfo=metainfo,
        test_mode=False,
        modality=input_modality,
        use_valid_flag=True,
        backend_args=backend_args))
# test and val share the evaluation pipeline (no augmentation, images only).
test_dataloader = dict(
    dataset=dict(
        type=dataset_type,
        data_prefix=dict(
            pts='samples/LIDAR_TOP',
            CAM_FRONT='samples/CAM_FRONT',
            CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
            CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
            CAM_BACK='samples/CAM_BACK',
            CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
            CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
        pipeline=test_pipeline,
        box_type_3d='LiDAR',
        metainfo=metainfo,
        test_mode=True,
        modality=input_modality,
        use_valid_flag=True,
        backend_args=backend_args))
val_dataloader = dict(
    dataset=dict(
        type=dataset_type,
        data_prefix=dict(
            pts='samples/LIDAR_TOP',
            CAM_FRONT='samples/CAM_FRONT',
            CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
            CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
            CAM_BACK='samples/CAM_BACK',
            CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
            CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
        pipeline=test_pipeline,
        box_type_3d='LiDAR',
        metainfo=metainfo,
        test_mode=True,
        modality=input_modality,
        use_valid_flag=True,
        backend_args=backend_args))

# Different from original PETR:
# We don't use special lr for image_backbone
# This seems won't affect model performance
optim_wrapper = dict(
    # TODO Add Amp
    # type='AmpOptimWrapper',
    # loss_scale='dynamic',
    optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01),
    paramwise_cfg=dict(custom_keys={
        'img_backbone': dict(lr_mult=0.1),
    }),
    clip_grad=dict(max_norm=35, norm_type=2))

num_epochs = 24

param_scheduler = [
    # Linear warmup over the first 500 iterations.
    dict(
        type='LinearLR',
        start_factor=1.0 / 3,
        begin=0,
        end=500,
        by_epoch=False),
    dict(
        type='CosineAnnealingLR',
        # TODO Figure out what T_max
        T_max=num_epochs,
        by_epoch=True,
    )
]

train_cfg = dict(max_epochs=num_epochs, val_interval=num_epochs)
# All model parameters receive gradients, so DDP need not scan for unused ones.
find_unused_parameters = False

# pretrain_path can be found here:
# https://drive.google.com/file/d/1ABI5BoQCkCkP4B0pO5KBJ3Ni0tei0gZi/view
# NOTE(review): machine-specific absolute path — adjust for your environment.
load_from = '/mnt/d/fcos3d_vovnet_imgbackbone-remapped.pth'
resume = False

# --------------Original---------------
# mAP: 0.3778
# mATE: 0.7463
# mASE: 0.2718
# mAOE: 0.4883
# mAVE: 0.9062
# mAAE: 0.2123
# NDS: 0.4264
# Eval time: 242.1s
# Per-class results:
# Object Class            AP     ATE    ASE    AOE    AVE    AAE
# car                     0.556  0.555  0.153  0.091  0.917  0.216
# truck                   0.330  0.805  0.218  0.119  0.859  0.250
# bus                     0.412  0.789  0.205  0.162  2.067  0.337
# trailer                 0.221  0.976  0.233  0.663  0.797  0.146
# construction_vehicle    0.094  1.096  0.493  1.145  0.190  0.349
# pedestrian              0.453  0.688  0.289  0.636  0.549  0.235
# motorcycle              0.368  0.690  0.256  0.622  1.417  0.149
# bicycle                 0.341  0.609  0.270  0.812  0.455  0.017
# traffic_cone            0.531  0.582  0.320  nan    nan    nan
# barrier                 0.472  0.673  0.281  0.145  nan    nan

# --------------Refactored in mmdet3d v1.0---------------
# mAP: 0.3827
# mATE: 0.7375
# mASE: 0.2703
# mAOE: 0.4799
# mAVE: 0.8699
# mAAE: 0.2038
# NDS: 0.4352
# Eval time: 124.8s
# Per-class results:
# Object Class            AP     ATE    ASE    AOE    AVE    AAE
# car                     0.574  0.519  0.150  0.087  0.865  0.206
# truck                   0.349  0.773  0.213  0.117  0.855  0.220
# bus                     0.423  0.781  0.204  0.122  1.902  0.319
# trailer                 0.219  1.034  0.231  0.608  0.830  0.149
# construction_vehicle    0.084  1.062  0.486  1.245  0.172  0.360
# pedestrian              0.452  0.681  0.293  0.646  0.529  0.231
# motorcycle              0.378  0.670  0.250  0.567  1.334  0.130
# bicycle                 0.347  0.639  0.264  0.788  0.472  0.016
# traffic_cone            0.538  0.553  0.325  nan    nan    nan
# barrier                 0.464  0.662  0.287  0.137  nan    nan

# --------------Refactored in mmdet3d v1.1---------------
# mAP: 0.3830
# mATE: 0.7547
# mASE: 0.2683
# mAOE: 0.4948
# mAVE: 0.8331
# mAAE: 0.2056
# NDS: 0.4358
# Eval time: 118.7s
# Per-class results:
# Object Class            AP     ATE    ASE    AOE    AVE    AAE
# car                     0.567  0.538  0.151  0.086  0.873  0.212
# truck                   0.341  0.785  0.213  0.113  0.821  0.234
# bus                     0.426  0.766  0.201  0.128  1.813  0.343
# trailer                 0.216  1.116  0.227  0.649  0.640  0.122
# construction_vehicle    0.093  1.118  0.483  1.292  0.217  0.330
# pedestrian              0.453  0.685  0.293  0.644  0.535  0.238
# motorcycle              0.374  0.700  0.253  0.624  1.291  0.154
# bicycle                 0.345  0.622  0.262  0.775  0.475  0.011
# traffic_cone            0.539  0.557  0.319  nan    nan    nan
# barrier                 0.476  0.661  0.279  0.142  nan    nan