# mAP: 0.3953
# mATE: 0.6941
# mASE: 0.2765
# mAOE: 0.4199
# mAVE: 0.7537
# mAAE: 0.1866
# NDS: 0.4646
# Inherit the shared runtime settings from the base config directory.
_base_ = ['../_base_/default_runtime.py']

# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
# NOTE: whenever this range is changed, the models must change their point
# cloud range accordingly.
# Layout is presumably [x_min, y_min, z_min, x_max, y_max, z_max] -- confirm.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# The usual 10-class detection setup for nuScenes.
class_names = [
    'barrier',
    'bicycle',
    'bus',
    'car',
    'construction_vehicle',
    'motorcycle',
    'pedestrian',
    'traffic_cone',
    'trailer',
    'truck',
]

dataset_type = 'CustomNuScenesDatasetV2'
data_root = 'data/nuscenes/'

# Input modality for the nuScenes dataset. This is kept consistent with the
# submission format, which requires the information in input_modality.
# Only the camera flag is switched on.
input_modality = dict(
    use_lidar=False, use_camera=True, use_radar=False,
    use_map=False, use_external=False,
)

# Per-channel mean with a std of 1 (no rescaling); to_rgb is disabled.
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675],
    std=[1, 1, 1],
    to_rgb=False,
)

# BEV grid size; these also feed row_num_embed / col_num_embed of the
# positional encoding in the model section.
bev_h_ = 200
bev_w_ = 200
frames = (0,)
group_detr = 11

# 102.4 equals the x/y extent of point_cloud_range (51.2 - (-51.2)) and 8 its
# z extent (3.0 - (-5.0)); keep them in sync when point_cloud_range changes.
voxel_size = [
    102.4 / bev_h_,
    102.4 / bev_w_,
    8,
]
# Image-space augmentation settings, handed to the CropResizeFlipImage
# pipeline step as its data_aug_conf argument.
# NOTE(review): the key is spelled 'reisze' (sic). It is a runtime key,
# presumably read under that exact spelling by the transform -- confirm before
# renaming it.
ida_aug_conf = dict(
    # Training resize candidates: 512, 544, ..., 768 (step 32), i.e. the
    # (0.8, 1.2) range relative to the single 640 used for evaluation.
    reisze=list(range(512, 769, 32)),
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=True,
)
# Evaluation variant: one fixed resize value and no random flip.
ida_aug_conf_eval = dict(
    reisze=[640],
    crop=(0, 260, 1600, 900),
    H=900,
    W=1600,
    rand_flip=False,
)
# file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
#     }))
# Training pipeline: the steps below are listed in the order they appear in
# the list, which is the order the dataset is expected to apply them.
train_pipeline = [
    # --- loading ---
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False,
    ),
    # --- global rotation / scale / flip augmentation ---
    # translation_std is all zeros, so no translation jitter is configured.
    dict(
        type='GlobalRotScaleTransImage',
        rot_range=[-22.5, 22.5],
        scale_ratio_range=[0.95, 1.05],
        translation_std=[0, 0, 0],
        reverse_angle=True,
        training=True,
        flip_dx_ratio=0.5,
        flip_dy_ratio=0.5,
        only_gt=True,
    ),
    # --- ground-truth filtering by range and by class name ---
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    # --- image-space augmentation, normalisation and padding ---
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf,
        training=True,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    # --- formatting and collection ---
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='CustomCollect3D',
        keys=[
            'gt_bboxes_3d',
            'gt_labels_3d',
            'img',
            'ego2global_translation',
            'ego2global_rotation',
            'lidar2ego_translation',
            'lidar2ego_rotation',
            'timestamp',
            'mono_input_dict',
            'mono_ann_idx',
            'aug_param',
        ],
    ),
    # Runs after collection, with both the 2D and 3D box tasks enabled.
    dict(
        type='DD3DMapper',
        is_train=True,
        tasks=dict(box2d_on=True, box3d_on=True),
    ),
]
# Evaluation pipeline: same loading / normalisation / padding steps as
# training, but with the fixed (non-random) ida_aug_conf_eval settings and no
# annotation loading.
eval_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(
        type='CropResizeFlipImage',
        data_aug_conf=ida_aug_conf_eval,
        training=False,
        debug=False,
    ),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    # Single scale, flipping disabled; the wrapper only carries the
    # formatting and collection transforms.
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 640),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False,
            ),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img',
                    'ego2global_translation',
                    'ego2global_rotation',
                    'lidar2ego_translation',
                    'lidar2ego_rotation',
                    'timestamp',
                ],
            ),
        ],
    ),
]

# Dataloader / dataset configuration.
# Every split takes its dataset class from `dataset_type` and its location
# from `data_root`, so changing either variable at the top of the file
# updates train, val and test (and the mono_cfg root) together.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
        # Settings for the monocular branch of the training dataset.
        mono_cfg=dict(
            name='nusc_trainval',
            data_root=data_root,
            min_num_lidar_points=3,
            min_box_visibility=0.2)),
    val=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality,
        samples_per_gpu=1),
    # The test split reuses the val annotation file.
    test=dict(
        type=dataset_type,
        frames=frames,
        data_root=data_root,
        ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
        pipeline=eval_pipeline,
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
# Evaluate with the eval pipeline at an interval of 4.
evaluation = dict(interval=4, pipeline=eval_pipeline)

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
# Checkpoint path given to `load_from`.
load_from = './ckpts/fcos_r50_coco_2mmdet.pth'

# Plugin switch and the directory holding the plugin code.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# Shared widths used throughout the model definition:
#   _dim_      embedding width (FPN out_channels, embed_dims, head in_channels)
#   _pos_dim_  num_feats of the learned positional encoding; half of _dim_
#              (presumably because row and column embeddings are
#              concatenated -- confirm)
#   _ffn_dim_  feedforward_channels of the transformer layers; twice _dim_
_dim_ = 256
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2

# Number of feature levels passed as the model's num_levels, and the number
# used for num_mono_levels / the FPN's num_outs.
_num_levels_ = 4
_num_mono_levels_ = 5

# BEVFormerV2 detector definition: ResNet-50 image backbone, FPN neck, a
# BEVFormerHead_GroupDETR head on the BEV grid (pts_bbox_head) and a
# NuscenesDD3D head (fcos3d_bbox_head).
# Class counts and level counts are taken from `class_names` and
# `_num_levels_` rather than repeated as literals, so they cannot drift apart
# from the rest of the config.
model = dict(
    type='BEVFormerV2',
    use_grid_mask=True,
    video_test_mode=False,
    num_levels=_num_levels_,
    num_mono_levels=_num_mono_levels_,
    mono_loss_weight=1.0,
    frames=frames,
    # ------------------------------------------------------------------
    # Image backbone and neck
    # ------------------------------------------------------------------
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        # Three stages are exported; their channel counts are listed in the
        # neck's in_channels below.
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='SyncBN'),
        norm_eval=False,
        style='caffe',
    ),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        # Three inputs are expanded to _num_mono_levels_ outputs.
        num_outs=_num_mono_levels_,
        relu_before_extra_convs=True,
    ),
    # ------------------------------------------------------------------
    # BEV detection head
    # ------------------------------------------------------------------
    pts_bbox_head=dict(
        type='BEVFormerHead_GroupDETR',
        group_detr=group_detr,
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=len(class_names),
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformerV2',
            embed_dims=_dim_,
            frames=frames,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                # Same level count as the model-level
                                # num_levels above.
                                num_levels=_num_levels_,
                            ),
                            embed_dims=_dim_,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        # Self-attention is configured with the same group
                        # count as the head's group_detr.
                        dict(
                            type='GroupMultiheadAttention',
                            group=group_detr,
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1,
                        ),
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm',
                        'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        bbox_coder=dict(
            type='NMSFreeCoder',
            # x/y bounds are 10 wider than point_cloud_range on each side.
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=len(class_names),
        ),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_,
        ),
        # These three loss weights (2.0 / 0.75 / 0.0) are repeated as the
        # matching-cost weights of the assigner in train_cfg.
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
    ),
    # ------------------------------------------------------------------
    # NuscenesDD3D head
    # ------------------------------------------------------------------
    fcos3d_bbox_head=dict(
        type='NuscenesDD3D',
        num_classes=len(class_names),
        in_channels=_dim_,
        # One stride per neck output level (5 entries).
        strides=[8, 16, 32, 64, 128],
        box3d_on=True,
        feature_locations_offset='none',
        fcos2d_cfg=dict(
            num_cls_convs=4,
            num_box_convs=4,
            norm='SyncBN',
            use_deformable=False,
            use_scale=True,
            box2d_scale_init_factor=1.0,
        ),
        fcos2d_loss_cfg=dict(
            focal_loss_alpha=0.25,
            focal_loss_gamma=2.0,
            loc_loss_type='giou',
        ),
        fcos3d_cfg=dict(
            num_convs=4,
            norm='SyncBN',
            use_scale=True,
            depth_scale_init_factor=0.3,
            proj_ctr_scale_init_factor=1.0,
            use_per_level_predictors=False,
            class_agnostic=False,
            use_deformable=False,
            # Five entries each, matching the five strides above.
            mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
            std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275],
        ),
        fcos3d_loss_cfg=dict(
            min_depth=0.1,
            max_depth=80.0,
            box3d_loss_weight=2.0,
            conf3d_loss_weight=1.0,
            conf_3d_temperature=1.0,
            smooth_l1_loss_beta=0.05,
            max_loss_per_group=20,
            predict_allocentric_rot=True,
            scale_depth_by_focal_lengths=True,
            scale_depth_by_focal_lengths_factor=500.0,
            class_agnostic=False,
            predict_distance=False,
            # Ten rows of three values.
            # NOTE(review): presumably one row per entry of class_names, in
            # the same order -- confirm before editing class_names.
            canon_box_sizes=[
                [2.3524184, 0.5062202, 1.0413622],
                [0.61416006, 1.7016163, 1.3054738],
                [2.9139307, 10.725025, 3.2832346],
                [1.9751819, 4.641267, 1.74352],
                [2.772134, 6.565072, 3.2474296],
                [0.7800532, 2.138673, 1.4437162],
                [0.6667362, 0.7181772, 1.7616143],
                [0.40246472, 0.4027083, 1.0084083],
                [3.0059454, 12.8197, 4.1213827],
                [2.4986045, 6.9310856, 2.8382742],
            ],
        ),
        target_assign_cfg=dict(
            center_sample=True,
            pos_radius=1.5,
            # Five (low, high) intervals; the last one is open-ended in
            # practice (upper bound 1e8).
            sizes_of_interest=(
                (-1, 64),
                (64, 128),
                (128, 256),
                (256, 512),
                (512, 100000000.0),
            ),
        ),
        nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2),
    ),
    # ------------------------------------------------------------------
    # Training-time settings
    # ------------------------------------------------------------------
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            # Cost weights repeat the loss weights of pts_bbox_head
            # (cls 2.0, bbox 0.75, iou 0.0).
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='SmoothL1Cost', weight=0.75),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
            ),
        ),
    ),
)

# ---------------------------------------------------------------------------
# Optimizer
# ---------------------------------------------------------------------------
# AdamW with base lr 4e-4; the `img_backbone` custom key carries lr_mult=0.5.
optimizer = dict(
    type='AdamW',
    lr=4e-4,
    paramwise_cfg=dict(custom_keys=dict(img_backbone=dict(lr_mult=0.5))),
    weight_decay=0.01,
)
# Gradient clipping: max_norm 35 with norm_type 2.
optimizer_config = dict(
    grad_clip=dict(
        max_norm=35,
        norm_type=2,
    ),
)

# ---------------------------------------------------------------------------
# Learning policy
# ---------------------------------------------------------------------------
# Step policy with a single milestone at 44 (total_epochs is 48), preceded by
# a linear warmup of 2000 iterations with warmup_ratio 1/3.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=2000,
    warmup_ratio=1.0 / 3,
    step=[44],
)
total_epochs = 48
runner = dict(
    type='EpochBasedRunner',
    max_epochs=total_epochs,
)