Commit 3b8d508a authored by lishj6

init_0905

parent e968ab0f
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py']
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_config = {
'cams': [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
],
'Ncams':
6,
'input_size': (256, 704),
'src_size': (900, 1600),
# Augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': True,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
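# Note: 'input_size' is (H, W). 'resize' is a random scale offset on top of
# the base input/src ratio and 'rot' an in-plane rotation range in degrees
# (the usual BEVDet image-augmentation semantics, assumed here).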
grid_config = {
'x': [-40, 40, 0.4],
'y': [-40, 40, 0.4],
'z': [-1, 5.4, 6.4],
'depth': [1.0, 45.0, 0.5],
}
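# Sanity check of the grid arithmetic, assuming the usual BEVDet convention of
# [lower_bound, upper_bound, interval] per axis: 80 m / 0.4 m gives a 200x200
# BEV grid, and (45 - 1) / 0.5 gives 88 depth bins.
assert round((grid_config['x'][1] - grid_config['x'][0]) / grid_config['x'][2]) == 200
assert round((grid_config['depth'][1] - grid_config['depth'][0]) / grid_config['depth'][2]) == 88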
voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 64
model = dict(
type='BEVDetOCC',
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(2, 3),
frozen_stages=-1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
with_cp=True,
style='pytorch',
pretrained='torchvision://resnet50',
),
img_neck=dict(
type='CustomFPN',
in_channels=[1024, 2048],
out_channels=256,
num_outs=1,
start_level=0,
out_ids=[0]),
img_view_transformer=dict(
type='LSSViewTransformer',
grid_config=grid_config,
input_size=data_config['input_size'],
in_channels=256,
out_channels=numC_Trans,
sid=False,
collapse_z=True,
downsample=16),
img_bev_encoder_backbone=dict(
type='CustomResNet',
numC_input=numC_Trans,
num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
img_bev_encoder_neck=dict(
type='FPN_LSS',
in_channels=numC_Trans * 8 + numC_Trans * 2,
out_channels=256),
occ_head=dict(
type='BEVOCCHead2D',
in_dim=256,
        out_dim=256,  # use out_dim=128 for the M0 variant
Dz=16,
use_mask=True,
num_classes=18,
use_predicter=True,
class_balance=False,
loss_occ=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
ignore_index=255,
loss_weight=1.0
),
)
)
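# A minimal sketch of the channel-to-height idea behind BEVOCCHead2D (shapes
# assumed from Dz=16, num_classes=18 and the 200x200 BEV grid); the guard
# keeps it from running during config parsing.
if __name__ == '__main__':
    import torch
    bev_feat = torch.randn(1, 256, 200, 200)             # output of the BEV neck
    head = torch.nn.Conv2d(256, 16 * 18, kernel_size=1)  # Dz * num_classes logits
    occ = head(bev_feat).view(1, 16, 18, 200, 200)       # (B, Dz, classes, H, W)
    print(occ.shape)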
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
bda_aug_conf = dict(
rot_lim=(-0., 0.),
scale_lim=(1., 1.),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5
)
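# Note: these are BEV-space (BDA) augmentations; rotation and scaling are
# disabled here, while BEV flips along x and y are each applied with
# probability 0.5.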
train_pipeline = [
dict(
type='PrepareImageInputs',
is_train=True,
data_config=data_config,
sequential=False),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=True),
dict(type='LoadOccGTFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
'mask_lidar', 'mask_camera'])
]
test_pipeline = [
dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=False),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img_inputs'])
])
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
share_data_config = dict(
type=dataset_type,
data_root=data_root,
classes=class_names,
modality=input_modality,
stereo=False,
filter_empty_gt=False,
img_info_prototype='bevdet',
)
test_data_config = dict(
pipeline=test_pipeline,
ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
data = dict(
samples_per_gpu=34,
workers_per_gpu=34,
train=dict(
data_root=data_root,
ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=test_data_config,
test=test_data_config)
for key in ['val', 'train', 'test']:
data[key].update(share_data_config)
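# dict.update merges the shared settings into every split; keys present on
# both sides (e.g. data_root for the train split) are simply overwritten with
# the identical shared value.
assert data['train']['type'] == data['val']['type'] == dataset_type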
# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=200,
warmup_ratio=0.001,
step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=30)
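# Note: the learning rate warms up linearly over the first 200 iterations and
# is then decayed by the step policy at epoch 24 (mmcv's default factor 0.1)
# out of 30 epochs total.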
custom_hooks = [
dict(
type='MEGVIIEMAHook',
init_updates=10560,
priority='NORMAL',
),
]
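# Note: MEGVIIEMAHook maintains an exponential moving average of the model
# weights for evaluation; init_updates seeds the EMA update counter, which
# controls how quickly the decay ramps up (the value presumably matches the
# detection-pretraining schedule; assumption, not verified).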
load_from = "ckpts/bevdet-r50-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# with det pretrain; use_mask=True;
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 6.74
# ===> barrier - IoU = 37.65
# ===> bicycle - IoU = 10.26
# ===> bus - IoU = 39.55
# ===> car - IoU = 44.36
# ===> construction_vehicle - IoU = 14.88
# ===> motorcycle - IoU = 13.4
# ===> pedestrian - IoU = 15.79
# ===> traffic_cone - IoU = 15.38
# ===> trailer - IoU = 27.44
# ===> truck - IoU = 31.73
# ===> driveable_surface - IoU = 78.82
# ===> other_flat - IoU = 37.98
# ===> sidewalk - IoU = 48.7
# ===> terrain - IoU = 52.5
# ===> manmade - IoU = 37.89
# ===> vegetation - IoU = 32.24
# ===> mIoU of 6019 samples: 32.08
# with det pretrain; use_mask=False; class_balance=True
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 4.49
# ===> barrier - IoU = 29.59
# ===> bicycle - IoU = 7.38
# ===> bus - IoU = 30.32
# ===> car - IoU = 32.22
# ===> construction_vehicle - IoU = 13.04
# ===> motorcycle - IoU = 11.91
# ===> pedestrian - IoU = 8.61
# ===> traffic_cone - IoU = 8.11
# ===> trailer - IoU = 7.66
# ===> truck - IoU = 20.84
# ===> driveable_surface - IoU = 48.59
# ===> other_flat - IoU = 26.62
# ===> sidewalk - IoU = 26.08
# ===> terrain - IoU = 20.86
# ===> manmade - IoU = 7.62
# ===> vegetation - IoU = 7.14
# ===> mIoU of 6019 samples: 18.3
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py']
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_config = {
'cams': [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
],
'Ncams':
6,
'input_size': (512, 1408),
'src_size': (900, 1600),
# Augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': True,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
# Model
grid_config = {
'x': [-40, 40, 0.4],
'y': [-40, 40, 0.4],
'z': [-1, 5.4, 6.4],
'depth': [1.0, 45.0, 0.5],
}
voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80
multi_adj_frame_id_cfg = (1, 1+1, 1)
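# The tuple is consumed as range(*multi_adj_frame_id_cfg), so (1, 1+1, 1)
# yields [1]: a single adjacent (previous) frame. num_adj below and the BEV
# encoder input width are both derived from its length.
assert list(range(*multi_adj_frame_id_cfg)) == [1]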
model = dict(
type='BEVStereo4DOCC',
align_after_view_transfromation=False,
num_adj=len(range(*multi_adj_frame_id_cfg)),
img_backbone=dict(
type='SwinTransformer',
pretrain_img_size=224,
patch_size=4,
window_size=12,
mlp_ratio=4,
embed_dims=128,
depths=[2, 2, 18, 2],
num_heads=[4, 8, 16, 32],
strides=(4, 2, 2, 2),
out_indices=(2, 3),
qkv_bias=True,
qk_scale=None,
patch_norm=True,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.1,
use_abs_pos_embed=False,
return_stereo_feat=True,
act_cfg=dict(type='GELU'),
norm_cfg=dict(type='LN', requires_grad=True),
pretrain_style='official',
output_missing_index_as_none=False),
img_neck=dict(
type='FPN_LSS',
in_channels=512 + 1024,
out_channels=512,
# with_cp=False,
extra_upsample=None,
input_feature_index=(0, 1),
scale_factor=2),
img_view_transformer=dict(
type='LSSViewTransformerBEVStereo',
grid_config=grid_config,
input_size=data_config['input_size'],
in_channels=512,
out_channels=numC_Trans,
sid=False,
collapse_z=True,
loss_depth_weight=0.05,
depthnet_cfg=dict(use_dcn=False,
aspp_mid_channels=96,
stereo=True,
bias=5.),
downsample=16),
img_bev_encoder_backbone=dict(
type='CustomResNet',
with_cp=True,
numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
img_bev_encoder_neck=dict(
type='FPN_LSS',
in_channels=numC_Trans * 8 + numC_Trans * 2,
out_channels=256),
pre_process=dict(
type='CustomResNet',
numC_input=numC_Trans,
num_layer=[1, ],
num_channels=[numC_Trans, ],
stride=[1, ],
backbone_output_ids=[0, ]),
occ_head=dict(
type='BEVOCCHead2D',
in_dim=256,
out_dim=256,
Dz=16,
use_mask=True,
num_classes=18,
use_predicter=True,
class_balance=False,
loss_occ=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
ignore_index=255,
loss_weight=1.0
),
)
)
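# The current and adjacent BEV features are concatenated along the channel
# axis before the BEV encoder, hence
# numC_input = numC_Trans * (num_adj + 1) = 80 * 2 = 160 here.
assert numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1) == 160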
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
bda_aug_conf = dict(
rot_lim=(-0., 0.),
scale_lim=(1., 1.),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5)
train_pipeline = [
dict(
type='PrepareImageInputs',
is_train=True,
data_config=data_config,
sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=True),
dict(type='LoadOccGTFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
'mask_lidar','mask_camera'])
]
test_pipeline = [
dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=False),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img_inputs'])
])
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
share_data_config = dict(
type=dataset_type,
classes=class_names,
modality=input_modality,
stereo=True,
filter_empty_gt=False,
img_info_prototype='bevdet4d',
multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)
test_data_config = dict(
data_root=data_root,
pipeline=test_pipeline,
ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
data = dict(
    samples_per_gpu=4,  # with 32 GPUs
workers_per_gpu=4,
train=dict(
data_root=data_root,
ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=test_data_config,
test=test_data_config)
for key in ['val', 'train', 'test']:
data[key].update(share_data_config)
# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=200,
warmup_ratio=0.001,
step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
dict(
type='MEGVIIEMAHook',
init_updates=10560,
priority='NORMAL',
),
dict(
type='SyncbnControlHook',
syncbn_start_epoch=0,
),
]
evaluation = dict(interval=6, start=0, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=3)
# load_from="ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"
resume_from="work_dirs/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2/epoch_5.pth"
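# Unlike load_from (weights only), resume_from also restores the optimizer
# state and epoch counter, continuing the interrupted run above.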
# fp16 = dict(loss_scale='dynamic')
# bash tools/dist_train.sh projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py 4
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py']
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_config = {
'cams': [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
],
'Ncams':
6,
'input_size': (512, 1408),
'src_size': (900, 1600),
# Augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': True,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
# Model
grid_config = {
'x': [-40, 40, 0.4],
'y': [-40, 40, 0.4],
'z': [-1, 5.4, 6.4],
'depth': [1.0, 45.0, 0.5],
}
voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80
multi_adj_frame_id_cfg = (1, 1+1, 1)
model = dict(
type='BEVStereo4DOCC',
align_after_view_transfromation=False,
num_adj=len(range(*multi_adj_frame_id_cfg)),
img_backbone=dict(
type='SwinTransformer',
pretrain_img_size=224,
patch_size=4,
window_size=12,
mlp_ratio=4,
embed_dims=128,
depths=[2, 2, 18, 2],
num_heads=[4, 8, 16, 32],
strides=(4, 2, 2, 2),
out_indices=(2, 3),
qkv_bias=True,
qk_scale=None,
patch_norm=True,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.1,
use_abs_pos_embed=False,
return_stereo_feat=True,
act_cfg=dict(type='GELU'),
norm_cfg=dict(type='LN', requires_grad=True),
pretrain_style='official',
output_missing_index_as_none=False),
img_neck=dict(
type='FPN_LSS',
in_channels=512 + 1024,
out_channels=512,
# with_cp=False,
extra_upsample=None,
input_feature_index=(0, 1),
scale_factor=2),
img_view_transformer=dict(
type='LSSViewTransformerBEVStereo',
grid_config=grid_config,
input_size=data_config['input_size'],
in_channels=512,
out_channels=numC_Trans,
sid=False,
collapse_z=True,
loss_depth_weight=0.05,
depthnet_cfg=dict(use_dcn=False,
aspp_mid_channels=96,
stereo=True,
bias=5.),
downsample=16),
img_bev_encoder_backbone=dict(
type='CustomResNet',
with_cp=True,
numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
img_bev_encoder_neck=dict(
type='FPN_LSS',
in_channels=numC_Trans * 8 + numC_Trans * 2,
out_channels=256),
pre_process=dict(
type='CustomResNet',
numC_input=numC_Trans,
num_layer=[1, ],
num_channels=[numC_Trans, ],
stride=[1, ],
backbone_output_ids=[0, ]),
occ_head=dict(
type='BEVOCCHead2D',
in_dim=256,
out_dim=256,
Dz=16,
use_mask=True,
num_classes=18,
use_predicter=True,
class_wise=False,
loss_occ=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
ignore_index=255,
loss_weight=1.0
),
)
)
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
bda_aug_conf = dict(
rot_lim=(-0., 0.),
scale_lim=(1., 1.),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5)
train_pipeline = [
dict(
type='PrepareImageInputs',
is_train=True,
data_config=data_config,
sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=True),
dict(type='LoadOccGTFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
'mask_lidar','mask_camera'])
]
test_pipeline = [
dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=False),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img_inputs'])
])
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
share_data_config = dict(
type=dataset_type,
classes=class_names,
modality=input_modality,
stereo=True,
filter_empty_gt=False,
img_info_prototype='bevdet4d',
multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)
test_data_config = dict(
data_root=data_root,
pipeline=test_pipeline,
ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
data = dict(
    samples_per_gpu=4,  # with 32 GPUs
workers_per_gpu=4,
train=dict(
data_root=data_root,
ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=test_data_config,
test=test_data_config)
for key in ['val', 'train', 'test']:
data[key].update(share_data_config)
# Optimizer
optimizer = dict(type='AdamW', lr=2e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=200,
warmup_ratio=0.001,
step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
dict(
type='MEGVIIEMAHook',
init_updates=10560,
priority='NORMAL',
),
dict(
type='SyncbnControlHook',
syncbn_start_epoch=0,
),
]
evaluation = dict(interval=6, start=0, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=3)
load_from="ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
# bash tools/dist_train.sh projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408.py 4
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 13.42
# ===> barrier - IoU = 51.07
# ===> bicycle - IoU = 27.68
# ===> bus - IoU = 51.57
# ===> car - IoU = 56.22
# ===> construction_vehicle - IoU = 27.27
# ===> motorcycle - IoU = 29.98
# ===> pedestrian - IoU = 29.93
# ===> traffic_cone - IoU = 29.8
# ===> trailer - IoU = 37.77
# ===> truck - IoU = 43.52
# ===> driveable_surface - IoU = 83.81
# ===> other_flat - IoU = 46.55
# ===> sidewalk - IoU = 56.15
# ===> terrain - IoU = 59.56
# ===> manmade - IoU = 50.84
# ===> vegetation - IoU = 44.67
# ===> mIoU of 6019 samples: 43.52
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 13.31
# ===> barrier - IoU = 51.62
# ===> bicycle - IoU = 28.07
# ===> bus - IoU = 50.91
# ===> car - IoU = 55.69
# ===> construction_vehicle - IoU = 27.46
# ===> motorcycle - IoU = 31.05
# ===> pedestrian - IoU = 29.98
# ===> traffic_cone - IoU = 29.2
# ===> trailer - IoU = 38.86
# ===> truck - IoU = 43.68
# ===> driveable_surface - IoU = 83.87
# ===> other_flat - IoU = 45.63
# ===> sidewalk - IoU = 56.33
# ===> terrain - IoU = 59.01
# ===> manmade - IoU = 50.63
# ===> vegetation - IoU = 44.56
# ===> mIoU of 6019 samples: 43.52
# {'mIoU': array([0.13311691, 0.51617081, 0.28070517, 0.50911942, 0.55694228,
# 0.27461342, 0.31050779, 0.29979125, 0.29204287, 0.38862984,
# 0.43680049, 0.83872518, 0.45630227, 0.56327839, 0.59008883,
# 0.50627122, 0.44564523, 0.90959399])}
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py']
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_config = {
'cams': [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
],
'Ncams':
6,
'input_size': (256, 704),
'src_size': (900, 1600),
# Augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': True,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
grid_config = {
'x': [-40, 40, 0.4],
'y': [-40, 40, 0.4],
'z': [-1, 5.4, 6.4],
'depth': [1.0, 45.0, 0.5],
}
voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80
model = dict(
type='BEVDepthPano', # single-frame
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(2, 3),
frozen_stages=-1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
with_cp=True,
style='pytorch',
pretrained='torchvision://resnet50',
),
img_neck=dict(
type='CustomFPN',
in_channels=[1024, 2048],
out_channels=256,
num_outs=1,
start_level=0,
out_ids=[0]),
img_view_transformer=dict(
type='LSSViewTransformerBEVDepth',
grid_config=grid_config,
input_size=data_config['input_size'],
in_channels=256,
out_channels=numC_Trans,
loss_depth_weight=1,
depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
downsample=16),
img_bev_encoder_backbone=dict(
type='CustomResNet',
numC_input=numC_Trans,
num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
img_bev_encoder_neck=dict(
type='FPN_LSS',
in_channels=numC_Trans * 8 + numC_Trans * 2,
out_channels=256),
aux_centerness_head=dict(
type='Centerness_Head',
task_specific_weight=[1, 1, 0, 0, 0],
in_channels=256,
tasks=[
dict(num_class=10, class_names=['car', 'truck',
'construction_vehicle',
'bus', 'trailer',
'barrier',
'motorcycle', 'bicycle',
'pedestrian', 'traffic_cone']),
],
common_heads=dict(
reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
share_conv_channel=64,
bbox_coder=dict(
type='CenterPointBBoxCoder',
pc_range=point_cloud_range[:2],
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
max_num=500,
            score_threshold=0.3,
out_size_factor=4,
voxel_size=voxel_size[:2],
code_size=9),
separate_head=dict(
type='SeparateHead', init_bias=-2.19, final_kernel=3),
loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
norm_bbox=True),
occ_head=dict(
type='BEVOCCHead2D_V2',
in_dim=256,
out_dim=256,
Dz=16,
use_mask=False,
num_classes=18,
use_predicter=True,
class_balance=True,
loss_occ=dict(
type='CustomFocalLoss',
use_sigmoid=True,
loss_weight=1.0
),
),
# model training and testing settings
train_cfg=dict(
pts=dict(
point_cloud_range=point_cloud_range,
grid_size=[800, 800, 40],
voxel_size=voxel_size,
out_size_factor=4,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
test_cfg=dict(
pts=dict(
max_per_img=500,
max_pool_nms=False,
min_radius=[4, 12, 10, 1, 0.85, 0.175],
score_threshold=0.1,
out_size_factor=4,
voxel_size=voxel_size[:2],
pre_max_size=1000,
post_max_size=500,
# Scale-NMS
nms_type=['rotate'],
nms_thr=[0.2],
nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55,
1.1, 1.0, 1.0, 1.5, 3.5]]
)
),
)
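# The dense-head grid follows from the ranges above: (40 - (-40)) / 0.1 = 800
# cells in x/y and (3 - (-5)) / 0.2 = 40 in z, matching grid_size=[800, 800, 40];
# out_size_factor=4 then maps detections onto the 200x200 BEV feature map.
# task_specific_weight=[1, 1, 0, 0, 0] presumably weights the five common_heads
# (reg, height, dim, rot, vel), keeping only the center and height terms for
# the auxiliary centerness task (assumption, not verified).
assert round((point_cloud_range[3] - point_cloud_range[0]) / voxel_size[0]) == 800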
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
bda_aug_conf = dict(
rot_lim=(-0., 0.),
scale_lim=(1., 1.),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5
)
train_pipeline = [
dict(
type='PrepareImageInputs',
is_train=True,
data_config=data_config,
sequential=False),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=True),
dict(type='LoadOccGTFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
'mask_lidar', 'mask_camera', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=False),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d'])
])
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
share_data_config = dict(
type=dataset_type,
data_root=data_root,
classes=class_names,
modality=input_modality,
stereo=False,
filter_empty_gt=False,
img_info_prototype='bevdet',
)
test_data_config = dict(
pipeline=test_pipeline,
ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
data_root=data_root,
ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=test_data_config,
test=test_data_config)
for key in ['val', 'train', 'test']:
data[key].update(share_data_config)
# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=200,
warmup_ratio=0.001,
step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
dict(
type='MEGVIIEMAHook',
init_updates=10560,
priority='NORMAL',
),
]
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 10.21
# ===> barrier - IoU = 42.14
# ===> bicycle - IoU = 22.82
# ===> bus - IoU = 40.13
# ===> car - IoU = 42.86
# ===> construction_vehicle - IoU = 20.69
# ===> motorcycle - IoU = 24.58
# ===> pedestrian - IoU = 23.7
# ===> traffic_cone - IoU = 24.02
# ===> trailer - IoU = 25.48
# ===> truck - IoU = 30.9
# ===> driveable_surface - IoU = 58.65
# ===> other_flat - IoU = 32.04
# ===> sidewalk - IoU = 34.27
# ===> terrain - IoU = 31.12
# ===> manmade - IoU = 18.26
# ===> vegetation - IoU = 17.79
# ===> mIoU of 6019 samples: 29.39
# {'mIoU': array([0.102, 0.421, 0.228, 0.401, 0.429, 0.207, 0.246, 0.237, 0.24 ,
# 0.255, 0.309, 0.586, 0.32 , 0.343, 0.311, 0.183, 0.178, 0.833])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.090 | 0.102 | 0.105 |
# | barrier | 0.387 | 0.442 | 0.465 |
# | bicycle | 0.218 | 0.257 | 0.265 |
# | bus | 0.514 | 0.613 | 0.669 |
# | car | 0.487 | 0.564 | 0.592 |
# | construction_vehicle | 0.176 | 0.254 | 0.288 |
# | motorcycle | 0.203 | 0.292 | 0.310 |
# | pedestrian | 0.301 | 0.349 | 0.366 |
# | traffic_cone | 0.280 | 0.313 | 0.321 |
# | trailer | 0.227 | 0.313 | 0.390 |
# | truck | 0.395 | 0.493 | 0.537 |
# | driveable_surface | 0.534 | 0.618 | 0.708 |
# | other_flat | 0.289 | 0.326 | 0.356 |
# | sidewalk | 0.234 | 0.280 | 0.329 |
# | terrain | 0.222 | 0.291 | 0.356 |
# | manmade | 0.280 | 0.351 | 0.401 |
# | vegetation | 0.176 | 0.273 | 0.359 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.295 | 0.361 | 0.401 |
# +----------------------+----------+----------+----------+
# +----------------------+---------+---------+---------+
# | Class Names | RayPQ@1 | RayPQ@2 | RayPQ@4 |
# +----------------------+---------+---------+---------+
# | others | 0.017 | 0.025 | 0.026 |
# | barrier | 0.125 | 0.182 | 0.218 |
# | bicycle | 0.051 | 0.072 | 0.076 |
# | bus | 0.275 | 0.366 | 0.422 |
# | car | 0.242 | 0.332 | 0.356 |
# | construction_vehicle | 0.016 | 0.058 | 0.092 |
# | motorcycle | 0.071 | 0.124 | 0.137 |
# | pedestrian | 0.017 | 0.022 | 0.023 |
# | traffic_cone | 0.032 | 0.040 | 0.044 |
# | trailer | 0.035 | 0.055 | 0.063 |
# | truck | 0.145 | 0.232 | 0.282 |
# | driveable_surface | 0.410 | 0.537 | 0.665 |
# | other_flat | 0.062 | 0.087 | 0.109 |
# | sidewalk | 0.008 | 0.030 | 0.064 |
# | terrain | 0.010 | 0.026 | 0.047 |
# | manmade | 0.054 | 0.091 | 0.134 |
# | vegetation | 0.003 | 0.022 | 0.092 |
# +----------------------+---------+---------+---------+
# | MEAN | 0.092 | 0.135 | 0.168 |
# +----------------------+---------+---------+---------+
# {'RayIoU': 0.35223182059688496, 'RayIoU@1': 0.29499743138394385, 'RayIoU@2': 0.3607063492639709, 'RayIoU@4': 0.4009916811427401,
# 'RayPQ': 0.13182524545677765, 'RayPQ@1': 0.09247682620339576, 'RayPQ@2': 0.1354024129684159, 'RayPQ@4': 0.16759649719852124}
_base_ = ['./flashoccv2-r50-depth-tiny-pano.py']
model = dict(
wocc=True,
wdet3d=False,
)
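# This file inherits everything from the pano config above and only flips two
# switches; mmcv merges config dicts recursively, so the rest of the model is
# unchanged. Judging by the names, wocc/wdet3d gate the occupancy and
# 3D-detection branches (assumption, not verified against the model code).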
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py']
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_config = {
'cams': [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
],
'Ncams':
6,
'input_size': (256, 704),
'src_size': (900, 1600),
# Augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': True,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
grid_config = {
'x': [-40, 40, 0.4],
'y': [-40, 40, 0.4],
'z': [-1, 5.4, 6.4],
'depth': [1.0, 45.0, 1.0],
}
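# Note: compared with the configs above (depth interval 0.5 m), the interval
# is doubled to 1.0 m, halving the depth bins to (45 - 1) / 1.0 = 44.
assert round((grid_config['depth'][1] - grid_config['depth'][0]) / grid_config['depth'][2]) == 44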
voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 64
model = dict(
type='BEVDepthPano', # single-frame
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(2, 3),
frozen_stages=-1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
with_cp=True,
style='pytorch',
pretrained='torchvision://resnet50',
),
img_neck=dict(
type='CustomFPN',
in_channels=[1024, 2048],
out_channels=256,
num_outs=1,
start_level=0,
out_ids=[0]),
img_view_transformer=dict(
type='LSSViewTransformerBEVDepth',
grid_config=grid_config,
input_size=data_config['input_size'],
in_channels=256,
out_channels=numC_Trans,
loss_depth_weight=1,
depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
downsample=16),
img_bev_encoder_backbone=dict(
type='CustomResNet',
numC_input=numC_Trans,
num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
img_bev_encoder_neck=dict(
type='FPN_LSS',
in_channels=numC_Trans * 8 + numC_Trans * 2,
out_channels=128),
aux_centerness_head=dict(
type='Centerness_Head',
task_specific_weight=[1, 1, 0, 0, 0],
in_channels=128,
tasks=[
dict(num_class=10, class_names=['car', 'truck',
'construction_vehicle',
'bus', 'trailer',
'barrier',
'motorcycle', 'bicycle',
'pedestrian', 'traffic_cone']),
],
common_heads=dict(
reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
share_conv_channel=64,
bbox_coder=dict(
type='CenterPointBBoxCoder',
pc_range=point_cloud_range[:2],
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
max_num=500,
            score_threshold=0.3,
out_size_factor=4,
voxel_size=voxel_size[:2],
code_size=9),
separate_head=dict(
type='SeparateHead', init_bias=-2.19, final_kernel=3),
loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
norm_bbox=True),
occ_head=dict(
type='BEVOCCHead2D_V2',
in_dim=128,
out_dim=128,
Dz=16,
use_mask=False,
num_classes=18,
use_predicter=True,
class_balance=True,
loss_occ=dict(
type='CustomFocalLoss',
use_sigmoid=True,
loss_weight=1.0
),
),
# model training and testing settings
train_cfg=dict(
pts=dict(
point_cloud_range=point_cloud_range,
grid_size=[800, 800, 40],
voxel_size=voxel_size,
out_size_factor=4,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
test_cfg=dict(
pts=dict(
max_per_img=500,
max_pool_nms=False,
min_radius=[4, 12, 10, 1, 0.85, 0.175],
score_threshold=0.1,
out_size_factor=4,
voxel_size=voxel_size[:2],
pre_max_size=1000,
post_max_size=500,
# Scale-NMS
nms_type=['rotate'],
nms_thr=[0.2],
nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55,
1.1, 1.0, 1.0, 1.5, 3.5]]
)
),
)
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
bda_aug_conf = dict(
rot_lim=(-0., 0.),
scale_lim=(1., 1.),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5
)
train_pipeline = [
dict(
type='PrepareImageInputs',
is_train=True,
data_config=data_config,
sequential=False),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=True),
dict(type='LoadOccGTFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
'mask_lidar', 'mask_camera', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=False),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d'])
])
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
share_data_config = dict(
type=dataset_type,
data_root=data_root,
classes=class_names,
modality=input_modality,
stereo=False,
filter_empty_gt=False,
img_info_prototype='bevdet',
)
test_data_config = dict(
pipeline=test_pipeline,
ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
data_root=data_root,
ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=test_data_config,
test=test_data_config)
for key in ['val', 'train', 'test']:
data[key].update(share_data_config)
# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=200,
warmup_ratio=0.001,
step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
dict(
type='MEGVIIEMAHook',
init_updates=10560,
priority='NORMAL',
),
]
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 10.33
# ===> barrier - IoU = 41.02
# ===> bicycle - IoU = 22.16
# ===> bus - IoU = 39.75
# ===> car - IoU = 42.63
# ===> construction_vehicle - IoU = 20.53
# ===> motorcycle - IoU = 24.01
# ===> pedestrian - IoU = 23.71
# ===> traffic_cone - IoU = 24.65
# ===> trailer - IoU = 25.58
# ===> truck - IoU = 30.63
# ===> driveable_surface - IoU = 58.0
# ===> other_flat - IoU = 32.12
# ===> sidewalk - IoU = 33.78
# ===> terrain - IoU = 31.02
# ===> manmade - IoU = 17.67
# ===> vegetation - IoU = 17.74
# ===> mIoU of 6019 samples: 29.14
# {'mIoU': array([0.103, 0.41 , 0.222, 0.397, 0.426, 0.205, 0.24 , 0.237, 0.246,
# 0.256, 0.306, 0.58 , 0.321, 0.338, 0.31 , 0.177, 0.177, 0.832])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.095 | 0.107 | 0.110 |
# | barrier | 0.374 | 0.429 | 0.452 |
# | bicycle | 0.208 | 0.242 | 0.248 |
# | bus | 0.498 | 0.603 | 0.659 |
# | car | 0.489 | 0.568 | 0.598 |
# | construction_vehicle | 0.171 | 0.247 | 0.279 |
# | motorcycle | 0.190 | 0.277 | 0.298 |
# | pedestrian | 0.295 | 0.344 | 0.361 |
# | traffic_cone | 0.290 | 0.324 | 0.332 |
# | trailer | 0.207 | 0.292 | 0.368 |
# | truck | 0.411 | 0.507 | 0.551 |
# | driveable_surface | 0.531 | 0.614 | 0.704 |
# | other_flat | 0.286 | 0.325 | 0.357 |
# | sidewalk | 0.234 | 0.280 | 0.328 |
# | terrain | 0.220 | 0.290 | 0.356 |
# | manmade | 0.267 | 0.343 | 0.392 |
# | vegetation | 0.174 | 0.272 | 0.358 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.291 | 0.357 | 0.397 |
# +----------------------+----------+----------+----------+
# +----------------------+---------+---------+---------+
# | Class Names | RayPQ@1 | RayPQ@2 | RayPQ@4 |
# +----------------------+---------+---------+---------+
# | others | 0.017 | 0.024 | 0.025 |
# | barrier | 0.107 | 0.169 | 0.204 |
# | bicycle | 0.069 | 0.086 | 0.088 |
# | bus | 0.244 | 0.350 | 0.408 |
# | car | 0.238 | 0.326 | 0.352 |
# | construction_vehicle | 0.018 | 0.081 | 0.105 |
# | motorcycle | 0.061 | 0.105 | 0.117 |
# | pedestrian | 0.016 | 0.022 | 0.023 |
# | traffic_cone | 0.030 | 0.049 | 0.052 |
# | trailer | 0.029 | 0.047 | 0.056 |
# | truck | 0.151 | 0.240 | 0.286 |
# | driveable_surface | 0.407 | 0.531 | 0.662 |
# | other_flat | 0.054 | 0.078 | 0.098 |
# | sidewalk | 0.009 | 0.030 | 0.061 |
# | terrain | 0.006 | 0.022 | 0.045 |
# | manmade | 0.044 | 0.091 | 0.128 |
# | vegetation | 0.001 | 0.021 | 0.091 |
# +----------------------+---------+---------+---------+
# | MEAN | 0.088 | 0.134 | 0.165 |
# +----------------------+---------+---------+---------+
# {'RayIoU': 0.34819957391233375, 'RayIoU@1': 0.29065973127346445, 'RayIoU@2': 0.3566749015912661, 'RayIoU@4': 0.39726408887227066,
# 'RayPQ': 0.12890890185841564, 'RayPQ@1': 0.08832135839934552, 'RayPQ@2': 0.1336058084882046, 'RayPQ@4': 0.1647995386876968}
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py']
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_config = {
'cams': [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
],
'Ncams':
6,
'input_size': (256, 704),
'src_size': (900, 1600),
# Augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': True,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
grid_config = {
'x': [-40, 40, 0.4],
'y': [-40, 40, 0.4],
'z': [-1, 5.4, 6.4],
'depth': [1.0, 45.0, 1.0],
}
voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 64
model = dict(
type='BEVDepthOCC', # single-frame
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(2, 3),
frozen_stages=-1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
with_cp=True,
style='pytorch',
pretrained='torchvision://resnet50',
),
img_neck=dict(
type='CustomFPN',
in_channels=[1024, 2048],
out_channels=256,
num_outs=1,
start_level=0,
out_ids=[0]),
img_view_transformer=dict(
type='LSSViewTransformerBEVDepth',
grid_config=grid_config,
input_size=data_config['input_size'],
in_channels=256,
out_channels=numC_Trans,
loss_depth_weight=1,
depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
downsample=16),
img_bev_encoder_backbone=dict(
type='CustomResNet',
numC_input=numC_Trans,
num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
img_bev_encoder_neck=dict(
type='FPN_LSS',
in_channels=numC_Trans * 8 + numC_Trans * 2,
out_channels=128),
occ_head=dict(
type='BEVOCCHead2D_V2',
in_dim=128,
out_dim=128,
Dz=16,
use_mask=False,
num_classes=18,
use_predicter=True,
class_balance=True,
loss_occ=dict(
type='CustomFocalLoss',
use_sigmoid=True,
loss_weight=1.0
),
)
)
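# Note: with use_mask=False and class_balance=True the occupancy loss is
# computed over all voxels with the sigmoid-based CustomFocalLoss and
# presumably reweighted per class, rather than masked to camera-visible voxels
# as in the use_mask=True configs above (assumption on the reweighting).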
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
bda_aug_conf = dict(
rot_lim=(-0., 0.),
scale_lim=(1., 1.),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5
)
train_pipeline = [
dict(
type='PrepareImageInputs',
is_train=True,
data_config=data_config,
sequential=False),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=True),
dict(type='LoadOccGTFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
'mask_lidar', 'mask_camera'])
]
test_pipeline = [
dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=False),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img_inputs'])
])
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
share_data_config = dict(
type=dataset_type,
data_root=data_root,
classes=class_names,
modality=input_modality,
stereo=False,
filter_empty_gt=False,
img_info_prototype='bevdet',
)
test_data_config = dict(
pipeline=test_pipeline,
ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
data_root=data_root,
ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=test_data_config,
test=test_data_config)
for key in ['val', 'train', 'test']:
data[key].update(share_data_config)
# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=200,
warmup_ratio=0.001,
step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
dict(
type='MEGVIIEMAHook',
init_updates=10560,
priority='NORMAL',
),
]
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# use_mask = False
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 10.69
# ===> barrier - IoU = 39.67
# ===> bicycle - IoU = 22.01
# ===> bus - IoU = 39.99
# ===> car - IoU = 40.46
# ===> construction_vehicle - IoU = 20.44
# ===> motorcycle - IoU = 24.52
# ===> pedestrian - IoU = 22.5
# ===> traffic_cone - IoU = 23.72
# ===> trailer - IoU = 25.93
# ===> truck - IoU = 29.75
# ===> driveable_surface - IoU = 58.29
# ===> other_flat - IoU = 31.46
# ===> sidewalk - IoU = 33.92
# ===> terrain - IoU = 31.25
# ===> manmade - IoU = 17.46
# ===> vegetation - IoU = 17.97
# ===> mIoU of 6019 samples: 28.83
# {'mIoU': array([0.1068576 , 0.3967071 , 0.220114 , 0.3998965 , 0.40462457,
# 0.20442682, 0.24516316, 0.22497209, 0.23719173, 0.25925541,
# 0.29754347, 0.58293305, 0.31458314, 0.33921965, 0.31254221,
# 0.17456574, 0.17970859, 0.8315865 ])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.094 | 0.107 | 0.111 |
# | barrier | 0.367 | 0.421 | 0.443 |
# | bicycle | 0.209 | 0.251 | 0.261 |
# | bus | 0.498 | 0.601 | 0.665 |
# | car | 0.472 | 0.550 | 0.581 |
# | construction_vehicle | 0.175 | 0.251 | 0.287 |
# | motorcycle | 0.205 | 0.292 | 0.315 |
# | pedestrian | 0.289 | 0.339 | 0.354 |
# | traffic_cone | 0.276 | 0.302 | 0.314 |
# | trailer | 0.203 | 0.289 | 0.380 |
# | truck | 0.396 | 0.493 | 0.546 |
# | driveable_surface | 0.528 | 0.611 | 0.702 |
# | other_flat | 0.280 | 0.315 | 0.346 |
# | sidewalk | 0.233 | 0.279 | 0.328 |
# | terrain | 0.218 | 0.286 | 0.353 |
# | manmade | 0.268 | 0.347 | 0.398 |
# | vegetation | 0.174 | 0.272 | 0.358 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.287 | 0.353 | 0.397 |
# +----------------------+----------+----------+----------+
# {'RayIoU': 0.34574739050176573, 'RayIoU@1': 0.2873820616941079, 'RayIoU@2': 0.3533573712072785,
# 'RayIoU@4': 0.39650273860391083}
_base_ = ['./flashoccv2-r50-depth.py']
model = dict(
wocc=True,
wdet3d=False,
)
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py']
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_config = {
'cams': [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
],
'Ncams':
6,
'input_size': (256, 704),
'src_size': (900, 1600),
# Augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': True,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
grid_config = {
'x': [-40, 40, 0.4],
'y': [-40, 40, 0.4],
'z': [-1, 5.4, 6.4],
'depth': [1.0, 45.0, 0.5],
}
voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80
model = dict(
type='BEVDepthPano', # single-frame
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(2, 3),
frozen_stages=-1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
with_cp=True,
style='pytorch',
pretrained='torchvision://resnet50',
),
img_neck=dict(
type='CustomFPN',
in_channels=[1024, 2048],
out_channels=256,
num_outs=1,
start_level=0,
out_ids=[0]),
img_view_transformer=dict(
type='LSSViewTransformerBEVDepth',
grid_config=grid_config,
input_size=data_config['input_size'],
in_channels=256,
out_channels=numC_Trans,
loss_depth_weight=1,
depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
downsample=16),
img_bev_encoder_backbone=dict(
type='CustomResNet',
numC_input=numC_Trans,
num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
img_bev_encoder_neck=dict(
type='FPN_LSS',
in_channels=numC_Trans * 8 + numC_Trans * 2,
out_channels=256),
aux_centerness_head=dict(
type='Centerness_Head',
task_specific_weight=[1, 1, 0, 0, 0],
in_channels=256,
tasks=[
dict(num_class=10, class_names=['car', 'truck',
'construction_vehicle',
'bus', 'trailer',
'barrier',
'motorcycle', 'bicycle',
'pedestrian', 'traffic_cone']),
],
common_heads=dict(
reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
share_conv_channel=64,
bbox_coder=dict(
type='CenterPointBBoxCoder',
pc_range=point_cloud_range[:2],
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
max_num=500,
            score_threshold=0.3,
out_size_factor=4,
voxel_size=voxel_size[:2],
code_size=9),
separate_head=dict(
type='SeparateHead', init_bias=-2.19, final_kernel=3),
loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
norm_bbox=True),
occ_head=dict(
type='BEVOCCHead2D_V2',
in_dim=256,
out_dim=256,
Dz=16,
use_mask=False,
num_classes=18,
use_predicter=True,
class_balance=True,
loss_occ=dict(
type='CustomFocalLoss',
use_sigmoid=True,
loss_weight=1.0
),
),
# model training and testing settings
train_cfg=dict(
pts=dict(
point_cloud_range=point_cloud_range,
grid_size=[800, 800, 40],
voxel_size=voxel_size,
out_size_factor=4,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
test_cfg=dict(
pts=dict(
max_per_img=500,
max_pool_nms=False,
min_radius=[4, 12, 10, 1, 0.85, 0.175],
score_threshold=0.1,
out_size_factor=4,
voxel_size=voxel_size[:2],
pre_max_size=1000,
post_max_size=500,
# Scale-NMS
nms_type=['rotate'],
nms_thr=[0.2],
nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55,
1.1, 1.0, 1.0, 1.5, 3.5]]
)
),
)
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
bda_aug_conf = dict(
rot_lim=(-0., 0.),
scale_lim=(1., 1.),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5
)
train_pipeline = [
dict(
type='PrepareImageInputs',
is_train=True,
data_config=data_config,
sequential=False),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=True),
dict(type='LoadOccGTFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
'mask_lidar', 'mask_camera'])
]
test_pipeline = [
dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=False),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img_inputs'])
])
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
share_data_config = dict(
type=dataset_type,
data_root=data_root,
classes=class_names,
modality=input_modality,
stereo=False,
filter_empty_gt=False,
img_info_prototype='bevdet',
)
test_data_config = dict(
pipeline=test_pipeline,
ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
data_root=data_root,
ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=test_data_config,
test=test_data_config)
for key in ['val', 'train', 'test']:
data[key].update(share_data_config)
# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=200,
warmup_ratio=0.001,
step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
dict(
type='MEGVIIEMAHook',
init_updates=10560,
priority='NORMAL',
),
]
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# use_mask = False
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.090 | 0.102 | 0.105 |
# | barrier | 0.387 | 0.442 | 0.465 |
# | bicycle | 0.218 | 0.257 | 0.265 |
# | bus | 0.514 | 0.613 | 0.669 |
# | car | 0.487 | 0.564 | 0.592 |
# | construction_vehicle | 0.176 | 0.254 | 0.288 |
# | motorcycle | 0.203 | 0.292 | 0.310 |
# | pedestrian | 0.301 | 0.349 | 0.366 |
# | traffic_cone | 0.280 | 0.313 | 0.321 |
# | trailer | 0.227 | 0.313 | 0.390 |
# | truck | 0.395 | 0.493 | 0.537 |
# | driveable_surface | 0.534 | 0.618 | 0.708 |
# | other_flat | 0.289 | 0.326 | 0.356 |
# | sidewalk | 0.234 | 0.280 | 0.329 |
# | terrain | 0.222 | 0.291 | 0.356 |
# | manmade | 0.280 | 0.351 | 0.401 |
# | vegetation | 0.176 | 0.273 | 0.359 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.295 | 0.361 | 0.401 |
# +----------------------+----------+----------+----------+
# +----------------------+---------+---------+---------+
# | Class Names | RayPQ@1 | RayPQ@2 | RayPQ@4 |
# +----------------------+---------+---------+---------+
# | others | 0.017 | 0.025 | 0.026 |
# | barrier | 0.125 | 0.182 | 0.218 |
# | bicycle | 0.051 | 0.072 | 0.076 |
# | bus | 0.275 | 0.366 | 0.422 |
# | car | 0.242 | 0.332 | 0.356 |
# | construction_vehicle | 0.016 | 0.058 | 0.092 |
# | motorcycle | 0.071 | 0.124 | 0.137 |
# | pedestrian | 0.017 | 0.022 | 0.023 |
# | traffic_cone | 0.032 | 0.040 | 0.044 |
# | trailer | 0.035 | 0.055 | 0.063 |
# | truck | 0.145 | 0.232 | 0.282 |
# | driveable_surface | 0.410 | 0.537 | 0.665 |
# | other_flat | 0.062 | 0.087 | 0.109 |
# | sidewalk | 0.008 | 0.030 | 0.064 |
# | terrain | 0.010 | 0.026 | 0.047 |
# | manmade | 0.054 | 0.091 | 0.134 |
# | vegetation | 0.003 | 0.022 | 0.092 |
# +----------------------+---------+---------+---------+
# | MEAN | 0.092 | 0.135 | 0.168 |
# +----------------------+---------+---------+---------+
# {'RayIoU': 0.35223182059688496, 'RayIoU@1': 0.29499743138394385, 'RayIoU@2': 0.3607063492639709, 'RayIoU@4': 0.4009916811427401, 'RayPQ': 0.13182524545677765, 'RayPQ@1': 0.09247682620339576, 'RayPQ@2': 0.1354024129684159, 'RayPQ@4': 0.16759649719852124}
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py']
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_config = {
'cams': [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
],
'Ncams':
6,
'input_size': (256, 704),
'src_size': (900, 1600),
# Augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': True,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
grid_config = {
'x': [-40, 40, 0.4],
'y': [-40, 40, 0.4],
'z': [-1, 5.4, 6.4],
'depth': [1.0, 45.0, 0.5],
}
voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80
multi_adj_frame_id_cfg = (1, 16+1, 1)
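# range(1, 17, 1) yields 16 adjacent frames, so num_adj below is 16 and the
# BEV encoder consumes numC_Trans * 17 = 1360 input channels.
assert len(range(*multi_adj_frame_id_cfg)) == 16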
model = dict(
type='BEVDepth4DPano',
num_adj=multi_adj_frame_id_cfg[1]-1,
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(2, 3),
frozen_stages=-1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
with_cp=True,
style='pytorch',
pretrained='torchvision://resnet50',
),
img_neck=dict(
type='CustomFPN',
in_channels=[1024, 2048],
out_channels=512,
num_outs=1,
start_level=0,
out_ids=[0]),
img_view_transformer=dict(
type='LSSViewTransformerBEVDepth',
grid_config=grid_config,
input_size=data_config['input_size'],
in_channels=512,
out_channels=numC_Trans,
loss_depth_weight=1,
depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
downsample=16),
pre_process=dict(
type='CustomResNet',
numC_input=numC_Trans,
num_layer=[1, ],
num_channels=[numC_Trans, ],
stride=[1, ],
backbone_output_ids=[0, ]),
img_bev_encoder_backbone=dict(
type='CustomResNet',
numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
img_bev_encoder_neck=dict(
type='FPN_LSS',
in_channels=numC_Trans * 8 + numC_Trans * 2,
out_channels=256),
aux_centerness_head=dict(
type='Centerness_Head',
task_specific_weight=[1, 1, 0, 0, 0],
in_channels=256,
tasks=[
dict(num_class=10, class_names=['car', 'truck',
'construction_vehicle',
'bus', 'trailer',
'barrier',
'motorcycle', 'bicycle',
'pedestrian', 'traffic_cone']),
],
common_heads=dict(
reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
share_conv_channel=64,
bbox_coder=dict(
type='CenterPointBBoxCoder',
pc_range=point_cloud_range[:2],
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
max_num=500,
            score_threshold=0.1,
out_size_factor=4,
voxel_size=voxel_size[:2],
code_size=9),
separate_head=dict(
type='SeparateHead', init_bias=-2.19, final_kernel=3),
loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
norm_bbox=True),
occ_head=dict(
type='BEVOCCHead2D_V2',
in_dim=256,
out_dim=256,
Dz=16,
use_mask=False,
num_classes=18,
use_predicter=True,
class_balance=True,
loss_occ=dict(
type='CustomFocalLoss',
use_sigmoid=True,
loss_weight=1.0
),
),
# model training and testing settings
train_cfg=dict(
pts=dict(
point_cloud_range=point_cloud_range,
grid_size=[800, 800, 40],
voxel_size=voxel_size,
out_size_factor=4,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
test_cfg=dict(
pts=dict(
max_per_img=500,
max_pool_nms=False,
min_radius=[4, 12, 10, 1, 0.85, 0.175],
score_threshold=0.1,
out_size_factor=4,
voxel_size=voxel_size[:2],
pre_max_size=1000,
post_max_size=500,
# Scale-NMS
nms_type=['rotate'],
nms_thr=[0.2],
nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55,
1.1, 1.0, 1.0, 1.5, 3.5]]
)
),
)
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
bda_aug_conf = dict(
rot_lim=(-0., 0.),
scale_lim=(1., 1.),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5
)
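# BDA (BEV-level data augmentation): rotation and scaling are effectively
# disabled here (zero-width / unit ranges); only random x/y flips are applied,
# each with probability 0.5.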
train_pipeline = [
dict(
type='PrepareImageInputs',
is_train=True,
data_config=data_config,
sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=True),
dict(type='LoadOccGTFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
'mask_lidar', 'mask_camera', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=False),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d'])
])
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
share_data_config = dict(
type=dataset_type,
data_root=data_root,
classes=class_names,
modality=input_modality,
stereo=False,
filter_empty_gt=False,
img_info_prototype='bevdet4d',
multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)
test_data_config = dict(
pipeline=test_pipeline,
ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
data_root=data_root,
ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=test_data_config,
test=test_data_config)
for key in ['val', 'train', 'test']:
data[key].update(share_data_config)
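# Note: dict.update() lets share_data_config override duplicate keys in each
# split; 'val' and 'test' reference the same test_data_config object, so that
# dict is simply updated twice.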
# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=200,
warmup_ratio=0.001,
step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
dict(
type='MEGVIIEMAHook',
init_updates=10560,
priority='NORMAL',
),
]
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py']
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_config = {
'cams': [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
],
'Ncams':
6,
'input_size': (256, 704),
'src_size': (900, 1600),
# Augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': True,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
grid_config = {
'x': [-40, 40, 0.4],
'y': [-40, 40, 0.4],
'z': [-1, 5.4, 6.4],
'depth': [1.0, 45.0, 0.5],
}
voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80
multi_adj_frame_id_cfg = (1, 16+1, 1)
model = dict(
type='BEVDepth4DOCC',
num_adj=multi_adj_frame_id_cfg[1]-1,
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(2, 3),
frozen_stages=-1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
with_cp=True,
style='pytorch',
pretrained='torchvision://resnet50',
),
img_neck=dict(
type='CustomFPN',
in_channels=[1024, 2048],
out_channels=512,
num_outs=1,
start_level=0,
out_ids=[0]),
img_view_transformer=dict(
type='LSSViewTransformerBEVDepth',
grid_config=grid_config,
input_size=data_config['input_size'],
in_channels=512,
out_channels=numC_Trans,
loss_depth_weight=1,
depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
downsample=16),
pre_process=dict(
type='CustomResNet',
numC_input=numC_Trans,
num_layer=[1, ],
num_channels=[numC_Trans, ],
stride=[1, ],
backbone_output_ids=[0, ]),
img_bev_encoder_backbone=dict(
type='CustomResNet',
numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
img_bev_encoder_neck=dict(
type='FPN_LSS',
in_channels=numC_Trans * 8 + numC_Trans * 2,
out_channels=256),
occ_head=dict(
type='BEVOCCHead2D_V2',
in_dim=256,
out_dim=256,
Dz=16,
use_mask=False,
num_classes=18,
use_predicter=True,
class_balance=True,
loss_occ=dict(
type='CustomFocalLoss',
use_sigmoid=True,
loss_weight=1.0
),
)
)
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
bda_aug_conf = dict(
rot_lim=(-0., 0.),
scale_lim=(1., 1.),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5
)
train_pipeline = [
dict(
type='PrepareImageInputs',
is_train=True,
data_config=data_config,
sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=True),
dict(type='LoadOccGTFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
'mask_lidar', 'mask_camera'])
]
test_pipeline = [
dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=False),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img_inputs'])
])
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
share_data_config = dict(
type=dataset_type,
data_root=data_root,
classes=class_names,
modality=input_modality,
stereo=False,
filter_empty_gt=False,
img_info_prototype='bevdet4d',
multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)
test_data_config = dict(
pipeline=test_pipeline,
ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
data_root=data_root,
ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=test_data_config,
test=test_data_config)
for key in ['val', 'train', 'test']:
data[key].update(share_data_config)
# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=200,
warmup_ratio=0.001,
step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
dict(
type='MEGVIIEMAHook',
init_updates=10560,
priority='NORMAL',
),
]
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# use_mask = False
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 11.94
# ===> barrier - IoU = 44.84
# ===> bicycle - IoU = 26.66
# ===> bus - IoU = 41.53
# ===> car - IoU = 44.42
# ===> construction_vehicle - IoU = 20.79
# ===> motorcycle - IoU = 26.96
# ===> pedestrian - IoU = 25.98
# ===> traffic_cone - IoU = 29.25
# ===> trailer - IoU = 24.24
# ===> truck - IoU = 32.28
# ===> driveable_surface - IoU = 60.5
# ===> other_flat - IoU = 33.07
# ===> sidewalk - IoU = 37.01
# ===> terrain - IoU = 33.54
# ===> manmade - IoU = 21.75
# ===> vegetation - IoU = 21.58
# ===> mIoU of 6019 samples: 31.55
# {'mIoU': array([0.119, 0.448, 0.267, 0.415, 0.444, 0.208, 0.27 , 0.26 , 0.293,
# 0.242, 0.323, 0.605, 0.331, 0.37 , 0.335, 0.217, 0.216, 0.839])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.110 | 0.118 | 0.119 |
# | barrier | 0.444 | 0.484 | 0.499 |
# | bicycle | 0.278 | 0.311 | 0.319 |
# | bus | 0.537 | 0.635 | 0.691 |
# | car | 0.512 | 0.585 | 0.611 |
# | construction_vehicle | 0.153 | 0.218 | 0.238 |
# | motorcycle | 0.228 | 0.310 | 0.330 |
# | pedestrian | 0.338 | 0.387 | 0.401 |
# | traffic_cone | 0.342 | 0.362 | 0.370 |
# | trailer | 0.209 | 0.293 | 0.368 |
# | truck | 0.422 | 0.511 | 0.555 |
# | driveable_surface | 0.570 | 0.653 | 0.742 |
# | other_flat | 0.301 | 0.340 | 0.375 |
# | sidewalk | 0.266 | 0.319 | 0.370 |
# | terrain | 0.261 | 0.334 | 0.400 |
# | manmade | 0.360 | 0.435 | 0.485 |
# | vegetation | 0.244 | 0.354 | 0.442 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.328 | 0.391 | 0.430 |
# +----------------------+----------+----------+----------+
# {'RayIoU': 0.38313147213727416, 'RayIoU@1': 0.3279517851047602, 'RayIoU@2': 0.3911038935232673, 'RayIoU@4': 0.4303387377837949}
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py']
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_config = {
'cams': [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
],
'Ncams':
6,
'input_size': (256, 704),
'src_size': (900, 1600),
# Augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': True,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
grid_config = {
'x': [-40, 40, 0.4],
'y': [-40, 40, 0.4],
'z': [-1, 5.4, 6.4],
'depth': [1.0, 45.0, 0.5],
}
voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80
multi_adj_frame_id_cfg = (1, 8+1, 1)
model = dict(
type='BEVDepth4DPano',
num_adj=multi_adj_frame_id_cfg[1]-1,
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(2, 3),
frozen_stages=-1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
with_cp=True,
style='pytorch',
pretrained='torchvision://resnet50',
),
img_neck=dict(
type='CustomFPN',
in_channels=[1024, 2048],
out_channels=512,
num_outs=1,
start_level=0,
out_ids=[0]),
img_view_transformer=dict(
type='LSSViewTransformerBEVDepth',
grid_config=grid_config,
input_size=data_config['input_size'],
in_channels=512,
out_channels=numC_Trans,
loss_depth_weight=1,
depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
downsample=16),
pre_process=dict(
type='CustomResNet',
numC_input=numC_Trans,
num_layer=[1, ],
num_channels=[numC_Trans, ],
stride=[1, ],
backbone_output_ids=[0, ]),
img_bev_encoder_backbone=dict(
type='CustomResNet',
numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
img_bev_encoder_neck=dict(
type='FPN_LSS',
in_channels=numC_Trans * 8 + numC_Trans * 2,
out_channels=256),
aux_centerness_head=dict(
type='Centerness_Head',
task_specific_weight=[1, 1, 0, 0, 0],
in_channels=256,
tasks=[
dict(num_class=10, class_names=['car', 'truck',
'construction_vehicle',
'bus', 'trailer',
'barrier',
'motorcycle', 'bicycle',
'pedestrian', 'traffic_cone']),
],
common_heads=dict(
reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
share_conv_channel=64,
bbox_coder=dict(
type='CenterPointBBoxCoder',
pc_range=point_cloud_range[:2],
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
max_num=500,
            score_threshold=0.3,
out_size_factor=4,
voxel_size=voxel_size[:2],
code_size=9),
separate_head=dict(
type='SeparateHead', init_bias=-2.19, final_kernel=3),
loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
norm_bbox=True),
occ_head=dict(
type='BEVOCCHead2D_V2',
in_dim=256,
out_dim=256,
Dz=16,
use_mask=False,
num_classes=18,
use_predicter=True,
class_balance=True,
loss_occ=dict(
type='CustomFocalLoss',
use_sigmoid=True,
loss_weight=1.0
),
),
# model training and testing settings
train_cfg=dict(
pts=dict(
point_cloud_range=point_cloud_range,
grid_size=[800, 800, 40],
voxel_size=voxel_size,
out_size_factor=4,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
test_cfg=dict(
pts=dict(
max_per_img=500,
max_pool_nms=False,
min_radius=[4, 12, 10, 1, 0.85, 0.175],
score_threshold=0.1,
out_size_factor=4,
voxel_size=voxel_size[:2],
pre_max_size=1000,
post_max_size=500,
# Scale-NMS
nms_type=['rotate'],
nms_thr=[0.2],
nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55,
1.1, 1.0, 1.0, 1.5, 3.5]]
)
),
)
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
bda_aug_conf = dict(
rot_lim=(-0., 0.),
scale_lim=(1., 1.),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5
)
train_pipeline = [
dict(
type='PrepareImageInputs',
is_train=True,
data_config=data_config,
sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=True),
dict(type='LoadOccGTFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
'mask_lidar', 'mask_camera', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=False),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d'])
])
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
share_data_config = dict(
type=dataset_type,
data_root=data_root,
classes=class_names,
modality=input_modality,
stereo=False,
filter_empty_gt=False,
img_info_prototype='bevdet4d',
multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)
test_data_config = dict(
pipeline=test_pipeline,
ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
data_root=data_root,
ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=test_data_config,
test=test_data_config)
for key in ['val', 'train', 'test']:
data[key].update(share_data_config)
# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=200,
warmup_ratio=0.001,
step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
dict(
type='MEGVIIEMAHook',
init_updates=10560,
priority='NORMAL',
),
]
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# use_mask = False
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 11.51
# ===> barrier - IoU = 45.87
# ===> bicycle - IoU = 24.65
# ===> bus - IoU = 41.75
# ===> car - IoU = 46.15
# ===> construction_vehicle - IoU = 20.96
# ===> motorcycle - IoU = 26.82
# ===> pedestrian - IoU = 26.77
# ===> traffic_cone - IoU = 29.66
# ===> trailer - IoU = 24.65
# ===> truck - IoU = 32.75
# ===> driveable_surface - IoU = 60.39
# ===> other_flat - IoU = 32.87
# ===> sidewalk - IoU = 36.49
# ===> terrain - IoU = 33.16
# ===> manmade - IoU = 21.3
# ===> vegetation - IoU = 20.92
# ===> mIoU of 6019 samples: 31.57
# {'mIoU': array([0.115, 0.459, 0.247, 0.418, 0.461, 0.21 , 0.268, 0.268, 0.297,
# 0.247, 0.328, 0.604, 0.329, 0.365, 0.332, 0.213, 0.209, 0.839])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.101 | 0.108 | 0.110 |
# | barrier | 0.439 | 0.480 | 0.497 |
# | bicycle | 0.258 | 0.286 | 0.293 |
# | bus | 0.540 | 0.649 | 0.700 |
# | car | 0.531 | 0.603 | 0.629 |
# | construction_vehicle | 0.180 | 0.252 | 0.282 |
# | motorcycle | 0.247 | 0.328 | 0.343 |
# | pedestrian | 0.347 | 0.393 | 0.409 |
# | traffic_cone | 0.346 | 0.371 | 0.378 |
# | trailer | 0.209 | 0.292 | 0.384 |
# | truck | 0.452 | 0.544 | 0.587 |
# | driveable_surface | 0.562 | 0.646 | 0.734 |
# | other_flat | 0.290 | 0.328 | 0.363 |
# | sidewalk | 0.261 | 0.313 | 0.363 |
# | terrain | 0.260 | 0.330 | 0.394 |
# | manmade | 0.345 | 0.421 | 0.471 |
# | vegetation | 0.229 | 0.337 | 0.423 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.329 | 0.393 | 0.433 |
# +----------------------+----------+----------+----------+
# +----------------------+---------+---------+---------+
# | Class Names | RayPQ@1 | RayPQ@2 | RayPQ@4 |
# +----------------------+---------+---------+---------+
# | others | 0.026 | 0.032 | 0.033 |
# | barrier | 0.184 | 0.232 | 0.253 |
# | bicycle | 0.088 | 0.103 | 0.108 |
# | bus | 0.311 | 0.406 | 0.458 |
# | car | 0.300 | 0.380 | 0.403 |
# | construction_vehicle | 0.032 | 0.057 | 0.081 |
# | motorcycle | 0.114 | 0.156 | 0.169 |
# | pedestrian | 0.025 | 0.030 | 0.031 |
# | traffic_cone | 0.071 | 0.081 | 0.085 |
# | trailer | 0.049 | 0.077 | 0.088 |
# | truck | 0.182 | 0.274 | 0.314 |
# | driveable_surface | 0.457 | 0.574 | 0.702 |
# | other_flat | 0.062 | 0.086 | 0.106 |
# | sidewalk | 0.018 | 0.042 | 0.091 |
# | terrain | 0.017 | 0.039 | 0.074 |
# | manmade | 0.077 | 0.144 | 0.194 |
# | vegetation | 0.002 | 0.061 | 0.162 |
# +----------------------+---------+---------+---------+
# | MEAN | 0.119 | 0.163 | 0.197 |
# +----------------------+---------+---------+---------+
# {'RayIoU': 0.3850202377154096, 'RayIoU@1': 0.3291477679560127, 'RayIoU@2': 0.39307010079658805, 'RayIoU@4': 0.4328428443936281,
# 'RayPQ': 0.15961266397677248, 'RayPQ@1': 0.11850092407498894, 'RayPQ@2': 0.1631862461686837, 'RayPQ@4': 0.19715082168664483}
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py']
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_config = {
'cams': [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
],
'Ncams':
6,
'input_size': (256, 704),
'src_size': (900, 1600),
# Augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': True,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
grid_config = {
'x': [-40, 40, 0.4],
'y': [-40, 40, 0.4],
'z': [-1, 5.4, 6.4],
'depth': [1.0, 45.0, 0.5],
}
voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80
multi_adj_frame_id_cfg = (1, 8+1, 1)
model = dict(
type='BEVDepth4DOCC',
num_adj=multi_adj_frame_id_cfg[1]-1,
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(2, 3),
frozen_stages=-1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
with_cp=True,
style='pytorch',
pretrained='torchvision://resnet50',
),
img_neck=dict(
type='CustomFPN',
in_channels=[1024, 2048],
out_channels=512,
num_outs=1,
start_level=0,
out_ids=[0]),
img_view_transformer=dict(
type='LSSViewTransformerBEVDepth',
grid_config=grid_config,
input_size=data_config['input_size'],
in_channels=512,
out_channels=numC_Trans,
loss_depth_weight=1,
depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
downsample=16),
pre_process=dict(
type='CustomResNet',
numC_input=numC_Trans,
num_layer=[1, ],
num_channels=[numC_Trans, ],
stride=[1, ],
backbone_output_ids=[0, ]),
img_bev_encoder_backbone=dict(
type='CustomResNet',
numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
img_bev_encoder_neck=dict(
type='FPN_LSS',
in_channels=numC_Trans * 8 + numC_Trans * 2,
out_channels=256),
occ_head=dict(
type='BEVOCCHead2D_V2',
in_dim=256,
out_dim=256,
Dz=16,
use_mask=False,
num_classes=18,
use_predicter=True,
class_balance=True,
loss_occ=dict(
type='CustomFocalLoss',
use_sigmoid=True,
loss_weight=1.0
),
)
)
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
bda_aug_conf = dict(
rot_lim=(-0., 0.),
scale_lim=(1., 1.),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5
)
train_pipeline = [
dict(
type='PrepareImageInputs',
is_train=True,
data_config=data_config,
sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=True),
dict(type='LoadOccGTFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
'mask_lidar', 'mask_camera'])
]
test_pipeline = [
dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=False),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img_inputs'])
])
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
share_data_config = dict(
type=dataset_type,
data_root=data_root,
classes=class_names,
modality=input_modality,
stereo=False,
filter_empty_gt=False,
img_info_prototype='bevdet4d',
multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)
test_data_config = dict(
pipeline=test_pipeline,
ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
data_root=data_root,
ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=test_data_config,
test=test_data_config)
for key in ['val', 'train', 'test']:
data[key].update(share_data_config)
# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=200,
warmup_ratio=0.001,
step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
dict(
type='MEGVIIEMAHook',
init_updates=10560,
priority='NORMAL',
),
]
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# use_mask = False
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 11.5
# ===> barrier - IoU = 44.1
# ===> bicycle - IoU = 25.89
# ===> bus - IoU = 41.0
# ===> car - IoU = 44.57
# ===> construction_vehicle - IoU = 21.88
# ===> motorcycle - IoU = 27.31
# ===> pedestrian - IoU = 25.95
# ===> traffic_cone - IoU = 29.04
# ===> trailer - IoU = 24.17
# ===> truck - IoU = 31.81
# ===> driveable_surface - IoU = 60.74
# ===> other_flat - IoU = 33.84
# ===> sidewalk - IoU = 36.62
# ===> terrain - IoU = 33.96
# ===> manmade - IoU = 21.54
# ===> vegetation - IoU = 21.36
# ===> mIoU of 6019 samples: 31.49
# {'mIoU': array([0.115, 0.441, 0.259, 0.41 , 0.446, 0.219, 0.273, 0.259, 0.29 ,
# 0.242, 0.318, 0.607, 0.338, 0.366, 0.34 , 0.215, 0.214, 0.839])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.107 | 0.115 | 0.116 |
# | barrier | 0.442 | 0.485 | 0.501 |
# | bicycle | 0.267 | 0.296 | 0.302 |
# | bus | 0.533 | 0.632 | 0.683 |
# | car | 0.516 | 0.590 | 0.616 |
# | construction_vehicle | 0.170 | 0.251 | 0.282 |
# | motorcycle | 0.231 | 0.325 | 0.350 |
# | pedestrian | 0.340 | 0.386 | 0.400 |
# | traffic_cone | 0.348 | 0.372 | 0.380 |
# | trailer | 0.232 | 0.317 | 0.400 |
# | truck | 0.427 | 0.514 | 0.559 |
# | driveable_surface | 0.566 | 0.649 | 0.736 |
# | other_flat | 0.302 | 0.341 | 0.374 |
# | sidewalk | 0.261 | 0.313 | 0.363 |
# | terrain | 0.258 | 0.333 | 0.399 |
# | manmade | 0.348 | 0.426 | 0.479 |
# | vegetation | 0.234 | 0.342 | 0.430 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.328 | 0.393 | 0.434 |
# +----------------------+----------+----------+----------+
# {'RayIoU': 0.3851476341258822, 'RayIoU@1': 0.3284556495395326, 'RayIoU@2': 0.39334760720480005, 'RayIoU@4': 0.43363964563331386}
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py']
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_config = {
'cams': [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
],
'Ncams':
6,
'input_size': (256, 704),
'src_size': (900, 1600),
# Augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': True,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
grid_config = {
'x': [-40, 40, 0.4],
'y': [-40, 40, 0.4],
'z': [-1, 5.4, 6.4],
'depth': [1.0, 45.0, 0.5],
}
voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80
multi_adj_frame_id_cfg = (1, 1+1, 1)
model = dict(
type='BEVDepth4DPano',
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(2, 3),
frozen_stages=-1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
with_cp=True,
style='pytorch',
pretrained='torchvision://resnet50',
),
img_neck=dict(
type='CustomFPN',
in_channels=[1024, 2048],
out_channels=512,
num_outs=1,
start_level=0,
out_ids=[0]),
img_view_transformer=dict(
type='LSSViewTransformerBEVDepth',
grid_config=grid_config,
input_size=data_config['input_size'],
in_channels=512,
out_channels=numC_Trans,
loss_depth_weight=1,
depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
downsample=16),
pre_process=dict(
type='CustomResNet',
numC_input=numC_Trans,
num_layer=[1, ],
num_channels=[numC_Trans, ],
stride=[1, ],
backbone_output_ids=[0, ]),
img_bev_encoder_backbone=dict(
type='CustomResNet',
numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
img_bev_encoder_neck=dict(
type='FPN_LSS',
in_channels=numC_Trans * 8 + numC_Trans * 2,
out_channels=256),
aux_centerness_head=dict(
type='Centerness_Head',
task_specific_weight=[1, 1, 0, 0, 0],
in_channels=256,
tasks=[
dict(num_class=10, class_names=['car', 'truck',
'construction_vehicle',
'bus', 'trailer',
'barrier',
'motorcycle', 'bicycle',
'pedestrian', 'traffic_cone']),
],
common_heads=dict(
reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
share_conv_channel=64,
bbox_coder=dict(
type='CenterPointBBoxCoder',
pc_range=point_cloud_range[:2],
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
max_num=500,
            score_threshold=0.3,
out_size_factor=4,
voxel_size=voxel_size[:2],
code_size=9),
separate_head=dict(
type='SeparateHead', init_bias=-2.19, final_kernel=3),
loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
norm_bbox=True),
occ_head=dict(
type='BEVOCCHead2D_V2',
in_dim=256,
out_dim=256,
Dz=16,
use_mask=False,
num_classes=18,
use_predicter=True,
class_balance=True,
loss_occ=dict(
type='CustomFocalLoss',
use_sigmoid=True,
loss_weight=1.0
),
),
# model training and testing settings
train_cfg=dict(
pts=dict(
point_cloud_range=point_cloud_range,
grid_size=[800, 800, 40],
voxel_size=voxel_size,
out_size_factor=4,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
test_cfg=dict(
pts=dict(
max_per_img=500,
max_pool_nms=False,
min_radius=[4, 12, 10, 1, 0.85, 0.175],
score_threshold=0.1,
out_size_factor=4,
voxel_size=voxel_size[:2],
pre_max_size=1000,
post_max_size=500,
# Scale-NMS
nms_type=['rotate'],
nms_thr=[0.2],
nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55,
1.1, 1.0, 1.0, 1.5, 3.5]]
)
),
)
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
bda_aug_conf = dict(
rot_lim=(-0., 0.),
scale_lim=(1., 1.),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5
)
train_pipeline = [
dict(
type='PrepareImageInputs',
is_train=True,
data_config=data_config,
sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=True),
dict(type='LoadOccGTFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
'mask_lidar', 'mask_camera', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=False),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d'])
])
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
share_data_config = dict(
type=dataset_type,
data_root=data_root,
classes=class_names,
modality=input_modality,
stereo=False,
filter_empty_gt=False,
img_info_prototype='bevdet4d',
multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)
test_data_config = dict(
pipeline=test_pipeline,
ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
data_root=data_root,
ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=test_data_config,
test=test_data_config)
for key in ['val', 'train', 'test']:
data[key].update(share_data_config)
# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=200,
warmup_ratio=0.001,
step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
dict(
type='MEGVIIEMAHook',
init_updates=10560,
priority='NORMAL',
),
]
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 10.89
# ===> barrier - IoU = 43.92
# ===> bicycle - IoU = 24.42
# ===> bus - IoU = 41.91
# ===> car - IoU = 45.18
# ===> construction_vehicle - IoU = 18.73
# ===> motorcycle - IoU = 25.59
# ===> pedestrian - IoU = 25.67
# ===> traffic_cone - IoU = 25.86
# ===> trailer - IoU = 25.29
# ===> truck - IoU = 31.84
# ===> driveable_surface - IoU = 59.03
# ===> other_flat - IoU = 31.53
# ===> sidewalk - IoU = 34.67
# ===> terrain - IoU = 31.49
# ===> manmade - IoU = 19.91
# ===> vegetation - IoU = 19.31
# ===> mIoU of 6019 samples: 30.31
# {'mIoU': array([0.109, 0.439, 0.244, 0.419, 0.452, 0.187, 0.256, 0.257, 0.259,
# 0.253, 0.318, 0.59 , 0.315, 0.347, 0.315, 0.199, 0.193, 0.835])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.094 | 0.105 | 0.107 |
# | barrier | 0.411 | 0.460 | 0.480 |
# | bicycle | 0.252 | 0.286 | 0.293 |
# | bus | 0.541 | 0.646 | 0.698 |
# | car | 0.520 | 0.594 | 0.621 |
# | construction_vehicle | 0.164 | 0.235 | 0.264 |
# | motorcycle | 0.212 | 0.305 | 0.321 |
# | pedestrian | 0.326 | 0.373 | 0.389 |
# | traffic_cone | 0.312 | 0.341 | 0.348 |
# | trailer | 0.220 | 0.291 | 0.372 |
# | truck | 0.430 | 0.520 | 0.565 |
# | driveable_surface | 0.552 | 0.633 | 0.720 |
# | other_flat | 0.293 | 0.330 | 0.361 |
# | sidewalk | 0.242 | 0.291 | 0.340 |
# | terrain | 0.236 | 0.305 | 0.369 |
# | manmade | 0.303 | 0.378 | 0.429 |
# | vegetation | 0.193 | 0.294 | 0.381 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.312 | 0.376 | 0.415 |
# +----------------------+----------+----------+----------+
# +----------------------+---------+---------+---------+
# | Class Names | RayPQ@1 | RayPQ@2 | RayPQ@4 |
# +----------------------+---------+---------+---------+
# | others | 0.020 | 0.028 | 0.030 |
# | barrier | 0.155 | 0.211 | 0.235 |
# | bicycle | 0.083 | 0.097 | 0.102 |
# | bus | 0.299 | 0.391 | 0.442 |
# | car | 0.277 | 0.360 | 0.384 |
# | construction_vehicle | 0.011 | 0.062 | 0.077 |
# | motorcycle | 0.098 | 0.149 | 0.166 |
# | pedestrian | 0.021 | 0.026 | 0.027 |
# | traffic_cone | 0.052 | 0.069 | 0.071 |
# | trailer | 0.043 | 0.062 | 0.071 |
# | truck | 0.158 | 0.248 | 0.293 |
# | driveable_surface | 0.440 | 0.559 | 0.680 |
# | other_flat | 0.065 | 0.089 | 0.107 |
# | sidewalk | 0.012 | 0.029 | 0.060 |
# | terrain | 0.009 | 0.028 | 0.053 |
# | manmade | 0.060 | 0.108 | 0.153 |
# | vegetation | 0.001 | 0.029 | 0.111 |
# +----------------------+---------+---------+---------+
# | MEAN | 0.106 | 0.150 | 0.180 |
# +----------------------+---------+---------+---------+
# {'RayIoU': 0.3676099569727112, 'RayIoU@1': 0.3118578145261225, 'RayIoU@2': 0.3757836068619914, 'RayIoU@4': 0.4151884495300196,
# 'RayPQ': 0.14529917059571107, 'RayPQ@1': 0.1061843618020449, 'RayPQ@2': 0.14961373290314467, 'RayPQ@4': 0.18009941708194366}
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py']
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
data_config = {
'cams': [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
'CAM_BACK', 'CAM_BACK_RIGHT'
],
'Ncams':
6,
'input_size': (256, 704),
'src_size': (900, 1600),
# Augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': True,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
grid_config = {
'x': [-40, 40, 0.4],
'y': [-40, 40, 0.4],
'z': [-1, 5.4, 6.4],
'depth': [1.0, 45.0, 0.5],
}
voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80
multi_adj_frame_id_cfg = (1, 1+1, 1)
model = dict(
type='BEVDepth4DOCC',
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(2, 3),
frozen_stages=-1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
with_cp=True,
style='pytorch',
pretrained='torchvision://resnet50',
),
img_neck=dict(
type='CustomFPN',
in_channels=[1024, 2048],
out_channels=512,
num_outs=1,
start_level=0,
out_ids=[0]),
img_view_transformer=dict(
type='LSSViewTransformerBEVDepth',
grid_config=grid_config,
input_size=data_config['input_size'],
in_channels=512,
out_channels=numC_Trans,
loss_depth_weight=1,
depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
downsample=16),
pre_process=dict(
type='CustomResNet',
numC_input=numC_Trans,
num_layer=[1, ],
num_channels=[numC_Trans, ],
stride=[1, ],
backbone_output_ids=[0, ]),
img_bev_encoder_backbone=dict(
type='CustomResNet',
numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
img_bev_encoder_neck=dict(
type='FPN_LSS',
in_channels=numC_Trans * 8 + numC_Trans * 2,
out_channels=256),
occ_head=dict(
type='BEVOCCHead2D_V2',
in_dim=256,
out_dim=256,
Dz=16,
use_mask=False,
num_classes=18,
use_predicter=True,
class_balance=True,
loss_occ=dict(
type='CustomFocalLoss',
use_sigmoid=True,
loss_weight=1.0
),
)
)
# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
bda_aug_conf = dict(
rot_lim=(-0., 0.),
scale_lim=(1., 1.),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5
)
train_pipeline = [
dict(
type='PrepareImageInputs',
is_train=True,
data_config=data_config,
sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=True),
dict(type='LoadOccGTFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
'mask_lidar', 'mask_camera'])
]
test_pipeline = [
dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
dict(
type='LoadAnnotationsBEVDepth',
bda_aug_conf=bda_aug_conf,
classes=class_names,
is_train=False),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img_inputs'])
])
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
share_data_config = dict(
type=dataset_type,
data_root=data_root,
classes=class_names,
modality=input_modality,
stereo=False,
filter_empty_gt=False,
img_info_prototype='bevdet4d',
multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)
test_data_config = dict(
pipeline=test_pipeline,
ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
data_root=data_root,
ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=test_data_config,
test=test_data_config)
for key in ['val', 'train', 'test']:
data[key].update(share_data_config)
# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=200,
warmup_ratio=0.001,
step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
dict(
type='MEGVIIEMAHook',
init_updates=10560,
priority='NORMAL',
),
]
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# use_mask = False
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 9.99
# ===> barrier - IoU = 41.3
# ===> bicycle - IoU = 22.84
# ===> bus - IoU = 41.17
# ===> car - IoU = 41.89
# ===> construction_vehicle - IoU = 20.84
# ===> motorcycle - IoU = 25.25
# ===> pedestrian - IoU = 23.98
# ===> traffic_cone - IoU = 24.36
# ===> trailer - IoU = 26.39
# ===> truck - IoU = 30.41
# ===> driveable_surface - IoU = 58.26
# ===> other_flat - IoU = 31.86
# ===> sidewalk - IoU = 34.47
# ===> terrain - IoU = 31.96
# ===> manmade - IoU = 18.87
# ===> vegetation - IoU = 18.95
# ===> mIoU of 6019 samples: 29.57
# {'mIoU': array([0.1 , 0.413, 0.228, 0.412, 0.419, 0.208, 0.253, 0.24 , 0.244,
# 0.264, 0.304, 0.583, 0.319, 0.345, 0.32 , 0.189, 0.189, 0.833])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.095 | 0.106 | 0.109 |
# | barrier | 0.392 | 0.444 | 0.466 |
# | bicycle | 0.236 | 0.279 | 0.287 |
# | bus | 0.513 | 0.616 | 0.675 |
# | car | 0.492 | 0.567 | 0.596 |
# | construction_vehicle | 0.170 | 0.256 | 0.296 |
# | motorcycle | 0.216 | 0.304 | 0.330 |
# | pedestrian | 0.315 | 0.363 | 0.378 |
# | traffic_cone | 0.280 | 0.315 | 0.323 |
# | trailer | 0.210 | 0.294 | 0.397 |
# | truck | 0.419 | 0.517 | 0.565 |
# | driveable_surface | 0.540 | 0.621 | 0.708 |
# | other_flat | 0.284 | 0.320 | 0.354 |
# | sidewalk | 0.242 | 0.289 | 0.337 |
# | terrain | 0.233 | 0.302 | 0.367 |
# | manmade | 0.291 | 0.370 | 0.422 |
# | vegetation | 0.190 | 0.290 | 0.376 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.301 | 0.368 | 0.411 |
# +----------------------+----------+----------+----------+
# {'RayIoU': 0.3599406945036808, 'RayIoU@1': 0.30094679699387594, 'RayIoU@2': 0.36785252629427645, 'RayIoU@4': 0.4110227602228899}
from .datasets import *
from .core import *
from .models import *
from .bbox import *
from .hook import *
from .coders import *
from .centerpoint_bbox_coders import CenterPointBBoxCoder
__all__ = ['CenterPointBBoxCoder']
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet.core.bbox import BaseBBoxCoder
from mmdet.core.bbox.builder import BBOX_CODERS
@BBOX_CODERS.register_module(force=True)
class CenterPointBBoxCoder(BaseBBoxCoder):
"""Bbox coder for CenterPoint.
Args:
pc_range (list[float]): Range of point cloud.
out_size_factor (int): Downsample factor of the model.
voxel_size (list[float]): Size of voxel.
post_center_range (list[float], optional): Limit of the center.
Default: None.
max_num (int, optional): Max number to be kept. Default: 100.
score_threshold (float, optional): Threshold to filter boxes
based on score. Default: None.
code_size (int, optional): Code size of bboxes. Default: 9
"""
def __init__(self,
pc_range,
out_size_factor,
voxel_size,
post_center_range=None,
max_num=100,
score_threshold=None,
code_size=9):
self.pc_range = pc_range # [x_min, y_min, ...]
self.out_size_factor = out_size_factor
self.voxel_size = voxel_size
self.post_center_range = post_center_range # [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
self.max_num = max_num
self.score_threshold = score_threshold
self.code_size = code_size
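    # Note: only the x/y minima of pc_range are used when mapping grid indices
    # back to metric coordinates in decode()/center_decode(); the configs above
    # accordingly pass point_cloud_range[:2].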
def _gather_feat(self, feats, inds, feat_masks=None):
"""Given feats and indexes, returns the gathered feats.
Args:
            feats (torch.Tensor): Features to be gathered, with the
                shape of (B, N, C).
inds (torch.Tensor): Indexes with the shape of [B, N].
feat_masks (torch.Tensor, optional): Mask of the feats.
Default: None.
Returns:
torch.Tensor: Gathered feats.
"""
dim = feats.size(2)
inds = inds.unsqueeze(2).expand(inds.size(0), inds.size(1), dim)
feats = feats.gather(1, inds)
if feat_masks is not None:
feat_masks = feat_masks.unsqueeze(2).expand_as(feats)
feats = feats[feat_masks]
feats = feats.view(-1, dim)
return feats
def _topk(self, scores, K=80):
"""Get indexes based on scores.
Args:
scores (torch.Tensor): scores with the shape of (B, N_cls, H, W).
K (int, optional): Number to be kept. Defaults to 80.
Returns:
tuple[torch.Tensor]
torch.Tensor: Selected scores with the shape of [B, K].
torch.Tensor: Selected indexes with the shape of [B, K].
torch.Tensor: Selected classes with the shape of [B, K].
torch.Tensor: Selected y coord with the shape of [B, K].
torch.Tensor: Selected x coord with the shape of [B, K].
"""
batch, cat, height, width = scores.size()
        # First take the top-K predictions independently for every class.
        # (B, N_cls, K), (B, N_cls, K)
        topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K)
        topk_inds = topk_inds % (height * width)    # (B, N_cls, K), flattened pixel indices in [0, H*W-1].
        topk_ys = (topk_inds.float() /
                   torch.tensor(width, dtype=torch.float)).int().float()   # (B, N_cls, K), y coordinates.
        topk_xs = (topk_inds % width).int().float()     # (B, N_cls, K), x coordinates.
        # Then take the top-K again over the pooled per-class candidates.
        # (B, K), (B, K)
        topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K)
        topk_clses = (topk_ind / torch.tensor(K, dtype=torch.float)).int()  # (B, K), class index of each candidate.
        # (B, N_cls*K, 1) --gather--> (B, K, 1) --> (B, K)   flattened pixel indices of the final top-K.
        topk_inds = self._gather_feat(topk_inds.view(batch, -1, 1),
                                      topk_ind).view(batch, K)
        # (B, N_cls*K, 1) --gather--> (B, K, 1) --> (B, K)   y coordinates of the final top-K.
        topk_ys = self._gather_feat(topk_ys.view(batch, -1, 1),
                                    topk_ind).view(batch, K)
        # (B, N_cls*K, 1) --gather--> (B, K, 1) --> (B, K)   x coordinates of the final top-K.
        topk_xs = self._gather_feat(topk_xs.view(batch, -1, 1),
                                    topk_ind).view(batch, K)
return topk_score, topk_inds, topk_clses, topk_ys, topk_xs
def _transpose_and_gather_feat(self, feat, ind):
"""Given feats and indexes, returns the transposed and gathered feats.
Args:
feat (torch.Tensor): Features to be transposed and gathered
with the shape of (B, N_c, H, W).
ind (torch.Tensor): Indexes with the shape of [B, K].
Returns:
torch.Tensor: Transposed and gathered feats.
"""
# (B, N_c, H, W) --> (B, H, W, N_c) --> (B, H*W, N_c)
feat = feat.permute(0, 2, 3, 1).contiguous()
feat = feat.view(feat.size(0), -1, feat.size(3))
feat = self._gather_feat(feat, ind) # (B, K, N_c)
return feat
def encode(self):
pass
def decode(self,
heat,
rot_sine,
rot_cosine,
hei,
dim,
vel,
reg=None,
task_id=-1):
"""Decode bboxes.
Args:
heat (torch.Tensor): Heatmap with the shape of (B, N_cls, H, W).
rot_sine (torch.Tensor): Sine of rotation with the shape of (B, 1, H, W).
rot_cosine (torch.Tensor): Cosine of rotation with the shape of (B, 1, H, W).
hei (torch.Tensor): Height of the boxes with the shape of (B, 1, H, W).
dim (torch.Tensor): Dim of the boxes with the shape of (B, 3, H, W).
            vel (torch.Tensor): Velocity with the shape of (B, 2, H, W).
reg (torch.Tensor, optional): Regression value of the boxes in
2D with the shape of (B, 2, H, W). Default: None.
task_id (int, optional): Index of task. Default: -1.
Returns:
list[dict]: Decoded boxes. List[p_dict0, p_dict1, ...]
p_dict = {
'bboxes': boxes3d, # (K', 9)
'scores': scores, # (K', )
'labels': labels # (K', )
}
"""
batch, cat, _, _ = heat.size()
# (B, K)
scores, inds, clses, ys, xs = self._topk(heat, K=self.max_num)
if reg is not None:
reg = self._transpose_and_gather_feat(reg, inds) # (B, K, 2)
reg = reg.view(batch, self.max_num, 2)
xs = xs.view(batch, self.max_num, 1) + reg[:, :, 0:1] # (B, K, 1) + (B, K, 1) --> (B, K, 1)
ys = ys.view(batch, self.max_num, 1) + reg[:, :, 1:2] # (B, K, 1) + (B, K, 1) --> (B, K, 1)
else:
xs = xs.view(batch, self.max_num, 1) + 0.5
ys = ys.view(batch, self.max_num, 1) + 0.5
# rotation value and direction label
rot_sine = self._transpose_and_gather_feat(rot_sine, inds) # (B, K, 1)
rot_sine = rot_sine.view(batch, self.max_num, 1)
rot_cosine = self._transpose_and_gather_feat(rot_cosine, inds) # (B, K, 1)
rot_cosine = rot_cosine.view(batch, self.max_num, 1)
rot = torch.atan2(rot_sine, rot_cosine) # (B, K, 1)
# height in the bev
hei = self._transpose_and_gather_feat(hei, inds)
hei = hei.view(batch, self.max_num, 1) # (B, K, 1)
# dim of the box
dim = self._transpose_and_gather_feat(dim, inds)
dim = dim.view(batch, self.max_num, 3) # (B, K, 3)
# class label
clses = clses.view(batch, self.max_num).float() # (B, K)
scores = scores.view(batch, self.max_num) # (B, K)
        # Convert grid indices to metric BEV coordinates.
xs = xs.view(
batch, self.max_num,
1) * self.out_size_factor * self.voxel_size[0] + self.pc_range[0]
ys = ys.view(
batch, self.max_num,
1) * self.out_size_factor * self.voxel_size[1] + self.pc_range[1]
        if vel is None:  # KITTI format (no velocity)
            final_box_preds = torch.cat([xs, ys, hei, dim, rot], dim=2)
        else:  # velocity present, nuScenes format
vel = self._transpose_and_gather_feat(vel, inds) # (B, K, 2)
vel = vel.view(batch, self.max_num, 2)
final_box_preds = torch.cat([xs, ys, hei, dim, rot, vel], dim=2) # (B, K, 9)
final_scores = scores
final_preds = clses
# use score threshold
if self.score_threshold is not None:
thresh_mask = final_scores > self.score_threshold # (B, K)
if self.post_center_range is not None:
self.post_center_range = torch.tensor(
self.post_center_range, device=heat.device)
mask = (final_box_preds[..., :3] >=
self.post_center_range[:3]).all(2) # (B, K, 3) --> (B, K)
mask &= (final_box_preds[..., :3] <=
self.post_center_range[3:]).all(2) # (B, K, 3) --> (B, K)
predictions_dicts = []
for i in range(batch):
cmask = mask[i, :] # (K, )
if self.score_threshold:
cmask &= thresh_mask[i] # (K, )
boxes3d = final_box_preds[i, cmask] # (K', 9)
scores = final_scores[i, cmask] # (K', )
labels = final_preds[i, cmask] # (K', )
predictions_dict = {
'bboxes': boxes3d, # (K', 9)
'scores': scores, # (K', )
'labels': labels # (K', )
}
# List[p_dict0, p_dict1, ...] len = batch_size
predictions_dicts.append(predictions_dict)
else:
            raise NotImplementedError(
                'Need to reorganize output as a batch; only a non-None '
                'post_center_range is supported for now!')
return predictions_dicts
def center_decode(self,
heat,
hei,
reg=None,
task_id=-1):
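        """Decode only object centers (x, y, z), analogous to ``decode`` but
        without the size, rotation and velocity branches.

        Returns:
            list[dict]: One dict per sample with keys 'centers' (K', 3),
                'scores' (K',) and 'labels' (K',).
        """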
batch, cat, _, _ = heat.size()
# (B, K)
scores, inds, clses, ys, xs = self._topk(heat, K=self.max_num)
if reg is not None:
reg = self._transpose_and_gather_feat(reg, inds) # (B, K, 2)
reg = reg.view(batch, self.max_num, 2)
xs = xs.view(batch, self.max_num, 1) + reg[:, :, 0:1] # (B, K, 1) + (B, K, 1) --> (B, K, 1)
ys = ys.view(batch, self.max_num, 1) + reg[:, :, 1:2] # (B, K, 1) + (B, K, 1) --> (B, K, 1)
else:
xs = xs.view(batch, self.max_num, 1) + 0.5
ys = ys.view(batch, self.max_num, 1) + 0.5
# height in the bev
hei = self._transpose_and_gather_feat(hei, inds)
hei = hei.view(batch, self.max_num, 1) # (B, K, 1)
# class label
clses = clses.view(batch, self.max_num).float() # (B, K)
scores = scores.view(batch, self.max_num) # (B, K)
        # Convert grid indices to metric BEV coordinates.
xs = xs.view(
batch, self.max_num,
1) * self.out_size_factor * self.voxel_size[0] + self.pc_range[0]
ys = ys.view(
batch, self.max_num,
1) * self.out_size_factor * self.voxel_size[1] + self.pc_range[1]
final_center_preds = torch.cat([xs, ys, hei], dim=2)
final_scores = scores
final_preds = clses
# use score threshold
if self.score_threshold is not None:
thresh_mask = final_scores > self.score_threshold # (B, K)
if self.post_center_range is not None:
self.post_center_range = torch.tensor(
self.post_center_range, device=heat.device)
mask = (final_center_preds[..., :3] >=
self.post_center_range[:3]).all(2) # (B, K, 3) --> (B, K)
mask &= (final_center_preds[..., :3] <=
self.post_center_range[3:]).all(2) # (B, K, 3) --> (B, K)
predictions_dicts = []
for i in range(batch):
cmask = mask[i, :] # (K, )
if self.score_threshold:
cmask &= thresh_mask[i] # (K, )
                centers = final_center_preds[i, cmask]  # (K', 3)
scores = final_scores[i, cmask] # (K', )
labels = final_preds[i, cmask] # (K', )
predictions_dict = {
                    'centers': centers,  # (K', 3)
'scores': scores, # (K', )
'labels': labels # (K', )
}
# List[p_dict0, p_dict1, ...] len = batch_size
predictions_dicts.append(predictions_dict)
else:
            raise NotImplementedError(
                'Need to reorganize output as a batch; only a non-None '
                'post_center_range is supported for now!')
return predictions_dicts
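# --- Usage sketch (editorial addition, not part of the original module) ---
# A minimal smoke test showing how the coder is driven; purely illustrative
# random inputs. Tensor shapes follow the docstrings above, and the
# constructor arguments mirror the nuScenes configs earlier in this commit.
if __name__ == '__main__':
    coder = CenterPointBBoxCoder(
        pc_range=[-40.0, -40.0],
        out_size_factor=4,
        voxel_size=[0.1, 0.1],
        post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
        max_num=50,
        score_threshold=0.1,
        code_size=9)
    B, N_cls, H, W = 1, 10, 200, 200
    heat = torch.rand(B, N_cls, H, W)   # class heatmaps, values in [0, 1)
    rot_sine = torch.randn(B, 1, H, W)
    rot_cosine = torch.randn(B, 1, H, W)
    hei = torch.randn(B, 1, H, W)
    dim = torch.randn(B, 3, H, W)
    vel = torch.randn(B, 2, H, W)
    reg = torch.rand(B, 2, H, W)        # sub-pixel center offsets
    preds = coder.decode(heat, rot_sine, rot_cosine, hei, dim, vel, reg=reg)
    # One dict per sample: 'bboxes' (K', 9), 'scores' (K',), 'labels' (K',).
    print(preds[0]['bboxes'].shape, preds[0]['scores'].shape)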