# BEVDet-OCC config: camera-only 3D occupancy prediction on nuScenes.
_base_ = [
    '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
    '../../../mmdetection3d/configs/_base_/default_runtime.py',
]

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection.
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

# Camera input layout and image-space augmentation ranges.
data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT',
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),
    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# BEV grid (x/y/z in meters: [lower, upper, step]) and depth bins.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 0.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]

# Channel width after the view transformer; downstream widths derive from it.
numC_Trans = 32

model = dict(
    type='BEVDetOCC',
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    img_view_transformer=dict(
        type='LSSViewTransformer',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        sid=False,
        collapse_z=False,
        downsample=16),
    img_bev_encoder_backbone=dict(
        type='CustomResNet3D',
        numC_input=numC_Trans,
        num_layer=[1, 2, 4],
        with_cp=False,
        num_channels=[numC_Trans, numC_Trans * 2, numC_Trans * 4],
        stride=[1, 2, 2],
        backbone_output_ids=[0, 1, 2]),
    # 32 + 64 + 128 = numC_Trans * 7 channels are concatenated by the neck.
    img_bev_encoder_neck=dict(
        type='LSSFPN3D',
        in_channels=numC_Trans * 7,
        out_channels=numC_Trans),
    occ_head=dict(
        type='BEVOCCHead3D',
        in_dim=numC_Trans,
        out_dim=32,
        use_mask=True,
        num_classes=18,
        use_predicter=True,
        class_balance=False,
        loss_occ=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            ignore_index=255,
            loss_weight=1.0
        ),
    )
)

# Data
# NOTE: the misspelling below matches the registered dataset class name.
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# BEV-space augmentation: no rotation/scaling, random x/y flips only.
bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics', 'mask_lidar',
              'mask_camera'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs'])
        ])
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# Settings merged into every split below.
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=True,
    filter_empty_gt=False,
    img_info_prototype='bevdet',
)

# Shared by val and test (same dict object on purpose).
test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
]

load_from = "ckpts/bevdet-r50-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)

# with pretrain
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 6.65
# ===> barrier - IoU = 36.97
# ===> bicycle - IoU = 8.33
# ===> bus - IoU = 38.69
# ===> car - IoU = 44.46
# ===> construction_vehicle - IoU = 15.21
# ===> motorcycle - IoU = 13.67
# ===> pedestrian - IoU = 16.39
# ===> traffic_cone - IoU = 15.27
# ===> trailer - IoU = 27.11
# ===> truck - IoU = 31.04
# ===> driveable_surface - IoU = 78.7
# ===> other_flat - IoU = 36.45
# ===> sidewalk - IoU = 48.27
# ===> terrain - IoU = 51.68
# ===> manmade - IoU = 36.82
# ===> vegetation - IoU = 32.09
# ===> mIoU of 6019 samples: 31.64

# with det pretrain; use_mask=False; class_balance=True
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 4.36
# ===> barrier - IoU = 28.87
# ===> bicycle - IoU = 2.86
# ===> bus - IoU = 29.27
# ===> car - IoU = 32.45
# ===> construction_vehicle - IoU = 11.05
# ===> motorcycle - IoU = 12.82
# ===> pedestrian - IoU = 10.11
# ===> traffic_cone - IoU = 9.47
# ===> trailer - IoU = 7.93
# ===> truck - IoU = 21.58
# ===> driveable_surface - IoU = 49.85
# ===> other_flat - IoU = 25.5
# ===> sidewalk - IoU = 26.78
# ===> terrain - IoU = 21.14
# ===> manmade - IoU = 5.76
# ===> vegetation - IoU = 7.09
# ===> mIoU of 6019 samples: 18.05