add code

d2b71343 · 雍大凯 · 69e57885 · d2b71343 · d2b71343 · c9541b0d
Commit d2b71343 authored Apr 08, 2026 by 雍大凯
20 changed files
--- a/docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.cu
+++ b/docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.cu
--- a/docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.hip
+++ b/docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.hip
--- a/mmdetection3d @ c9541b0d
+++ b/mmdetection3d @ c9541b0d
+Subproject commit c9541b0db89498fdea5cafd05b7b17f7b625b858
--- a/docker-hub/FlashOCC/Flashocc/projects/__init__.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/__init__.py
--- a/docker-hub/FlashOCC/Flashocc/projects/__pycache__/__init__.cpython-310.pyc
+++ b/docker-hub/FlashOCC/Flashocc/projects/__pycache__/__init__.cpython-310.pyc
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-r50-4d-stereo.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-r50-4d-stereo.py
+# Occupancy config for the BEVStereo4DOCC model with a ResNet-50 image
+# backbone and 256x704 network input. Inherits the nuScenes dataset and
+# default-runtime base configs from the mmdetection3d checkout.
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+
+# Plugin switch and plugin directory (relative path).
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+# NOTE(review): point_cloud_range is not referenced again in this config.
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+# (passed below to LoadAnnotationsBEVDepth, DefaultFormatBundle3D and the
+# dataset's `classes` argument).
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+# Camera-image input settings. The whole dict is handed to
+# PrepareImageInputs in both pipelines; 'input_size' is also passed to the
+# view transformer.
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    # presumably (height, width) of the network input / the raw image —
+    # TODO confirm against PrepareImageInputs
+    'input_size': (256, 704),
+    'src_size': (900, 1600),
+
+    # Augmentation
+    # (value semantics are defined by PrepareImageInputs, not shown here)
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+
+# Grid definition shared by the view transformer and PointToMultiViewDepth.
+# Each entry is presumably [lower bound, upper bound, step] — TODO confirm.
+# Under that reading: x/y -> 200 cells, z -> 16 cells, depth -> 88 bins.
+grid_config = {
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 0.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+
+# NOTE(review): voxel_size is not referenced again in this config.
+voxel_size = [0.1, 0.1, 0.2]
+
+# Base channel count; the view-transformer output width and the BEV
+# encoder / occupancy head widths below are derived from it.
+numC_Trans = 32
+# Unpacked into range() below: range(1, 2, 1) -> [1], i.e. one entry.
+multi_adj_frame_id_cfg = (1, 1+1, 1)
+
+
+# Model definition. Each nested dict configures one component of
+# BEVStereo4DOCC; how the components are wired together is defined by the
+# model class in the plugin (not shown here).
+model = dict(
+    type='BEVStereo4DOCC',
+    # Spelling ("transfromation") is kept as-is: the key is passed to the
+    # model as a keyword argument.
+    align_after_view_transfromation=False,
+    # len(range(1, 2, 1)) == 1
+    num_adj=len(range(*multi_adj_frame_id_cfg)),
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        # NOTE(review): three stages are output here, while img_neck lists
+        # only two in_channels — presumably stage 0 feeds the stereo branch;
+        # confirm against the model class.
+        out_indices=(0, 2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVStereo',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        # matches img_neck out_channels (256)
+        in_channels=256,
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=False,
+        loss_depth_weight=0.05,
+        depthnet_cfg=dict(use_dcn=False,
+                          aspp_mid_channels=96,
+                          stereo=True,
+                          bias=5.),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet3D',
+        # 32 * (1 + 1) = 64 input channels
+        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
+        num_layer=[1, 2, 4],
+        with_cp=False,
+        # [32, 64, 128]
+        num_channels=[numC_Trans, numC_Trans*2, numC_Trans*4],
+        stride=[1, 2, 2],
+        backbone_output_ids=[0, 1, 2]),
+    # 7 * 32 = 224 = 32 + 64 + 128, i.e. the sum of the backbone's
+    # num_channels above.
+    img_bev_encoder_neck=dict(type='LSSFPN3D',
+                              in_channels=numC_Trans*7,
+                              out_channels=numC_Trans),
+    pre_process=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans,
+        with_cp=False,
+        num_layer=[1, ],
+        num_channels=[numC_Trans, ],
+        stride=[1, ],
+        backbone_output_ids=[0, ]),
+    occ_head=dict(
+        type='BEVOCCHead3D',
+        # matches img_bev_encoder_neck out_channels (numC_Trans)
+        in_dim=numC_Trans,
+        out_dim=32,
+        use_mask=True,
+        # NOTE(review): the IoU table at the end of this file lists 17
+        # classes; the 18th is presumably the free/empty class — confirm.
+        num_classes=18,
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+
+# Training data pipeline. The final Collect3D step keeps the image inputs,
+# the depth target, the occupancy labels and the two masks.
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    # presumably builds the 'gt_depth' entry collected below from the
+    # loaded points — TODO confirm against the plugin
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera'])
+]
+
+# Test/validation data pipeline. No occupancy ground truth is loaded here;
+# only 'points' and 'img_inputs' are collected.
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        # NOTE(review): img_scale differs from data_config['input_size'];
+        # presumably a placeholder required by MultiScaleFlipAug3D, since
+        # images are prepared by PrepareImageInputs — confirm.
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+
+share_data_config = dict(
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=True,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet4d',
+    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
+)
+
+test_data_config = dict(
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+
+for key in ['val', 'train', 'test']:
+    data[key].update(share_data_config)
+
+# Optimizer
+# AdamW with gradient-norm clipping (L2 norm, max 5).
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+# Step LR policy with a 200-iteration linear warmup.
+# NOTE(review): the only decay step (24) equals max_epochs (24) below, so
+# the step decay presumably never takes effect during the run — confirm
+# this is intended.
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+
+# Custom hook from the plugin; the meaning of init_updates is defined by
+# MEGVIIEMAHook (not shown here).
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+]
+
+# Checkpoint loaded before training; the path is relative.
+load_from = "ckpts/bevdet-r50-4d-stereo-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
+# presumably: evaluate every epoch starting from epoch 20, and save a
+# checkpoint every epoch keeping the 5 most recent — TODO confirm against
+# the mmcv hook semantics.
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)
+
+
+# with_pretrain:
+# align_after_view_transfromation=False
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 8.22
+# ===> barrier - IoU = 44.21
+# ===> bicycle - IoU = 10.34
+# ===> bus - IoU = 42.08
+# ===> car - IoU = 49.63
+# ===> construction_vehicle - IoU = 23.37
+# ===> motorcycle - IoU = 17.41
+# ===> pedestrian - IoU = 21.49
+# ===> traffic_cone - IoU = 19.7
+# ===> trailer - IoU = 31.33
+# ===> truck - IoU = 37.09
+# ===> driveable_surface - IoU = 80.13
+# ===> other_flat - IoU = 37.37
+# ===> sidewalk - IoU = 50.41
+# ===> terrain - IoU = 54.29
+# ===> manmade - IoU = 45.56
+# ===> vegetation - IoU = 39.59
+# ===> mIoU of 6019 samples: 36.01
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-r50.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-r50.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (256, 704),
+    'src_size': (900, 1600),
+
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+
+grid_config = {
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 0.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+
+voxel_size = [0.1, 0.1, 0.2]
+
+numC_Trans = 32
+
+# Model definition for the single-frame BEVDetOCC variant: plain
+# LSSViewTransformer, no depth-loss settings, and no pre_process /
+# adjacent-frame settings in this config.
+model = dict(
+    type='BEVDetOCC',
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        # two output stages, matching the two img_neck in_channels below
+        out_indices=(2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch',
+        # NOTE(review): backbone weights are requested here and a full
+        # checkpoint is also set via load_from later in this file; which one
+        # wins depends on the loading order — confirm.
+        pretrained='torchvision://resnet50',
+    ),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformer',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        # matches img_neck out_channels (256)
+        in_channels=256,
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=False,
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans,
+        num_layer=[1, 2, 4],
+        with_cp=False,
+        # [32, 64, 128]
+        num_channels=[numC_Trans, numC_Trans*2, numC_Trans*4],
+        stride=[1, 2, 2],
+        backbone_output_ids=[0, 1, 2]),
+    # 7 * 32 = 224 = 32 + 64 + 128, i.e. the sum of the backbone's
+    # num_channels above.
+    img_bev_encoder_neck=dict(type='LSSFPN3D',
+                              in_channels=numC_Trans*7,
+                              out_channels=numC_Trans),
+    occ_head=dict(
+        type='BEVOCCHead3D',
+        # matches img_bev_encoder_neck out_channels (numC_Trans)
+        in_dim=numC_Trans,
+        out_dim=32,
+        use_mask=True,
+        # NOTE(review): the IoU tables at the end of this file list 17
+        # classes; the 18th is presumably the free/empty class — confirm.
+        num_classes=18,
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera'])
+]
+
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+
+
+# Sensor modalities used by the dataset: camera only.
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+
+# Settings merged into every split (train/val/test) by the loop below.
+share_data_config = dict(
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    # NOTE(review): stereo=True although this config uses the non-stereo
+    # LSSViewTransformer, sequential=False in both pipelines and
+    # img_info_prototype='bevdet' — confirm the dataset ignores the flag in
+    # this mode, or set it to False.
+    stereo=True,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet',
+)
+
+# Shared by both `val` and `test` below (the same dict object is used).
+test_data_config = dict(
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+
+# Merge the shared settings into each split. `val` and `test` reference the
+# same dict, so it is updated twice with identical values (harmless).
+for key in ['val', 'train', 'test']:
+    data[key].update(share_data_config)
+
+# Optimizer
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+]
+
+load_from = "ckpts/bevdet-r50-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)
+
+
+# with pretrain
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 6.65
+# ===> barrier - IoU = 36.97
+# ===> bicycle - IoU = 8.33
+# ===> bus - IoU = 38.69
+# ===> car - IoU = 44.46
+# ===> construction_vehicle - IoU = 15.21
+# ===> motorcycle - IoU = 13.67
+# ===> pedestrian - IoU = 16.39
+# ===> traffic_cone - IoU = 15.27
+# ===> trailer - IoU = 27.11
+# ===> truck - IoU = 31.04
+# ===> driveable_surface - IoU = 78.7
+# ===> other_flat - IoU = 36.45
+# ===> sidewalk - IoU = 48.27
+# ===> terrain - IoU = 51.68
+# ===> manmade - IoU = 36.82
+# ===> vegetation - IoU = 32.09
+# ===> mIoU of 6019 samples: 31.64
+
+
+# with det pretrain; use_mask=False; class_balance=True
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 4.36
+# ===> barrier - IoU = 28.87
+# ===> bicycle - IoU = 2.86
+# ===> bus - IoU = 29.27
+# ===> car - IoU = 32.45
+# ===> construction_vehicle - IoU = 11.05
+# ===> motorcycle - IoU = 12.82
+# ===> pedestrian - IoU = 10.11
+# ===> traffic_cone - IoU = 9.47
+# ===> trailer - IoU = 7.93
+# ===> truck - IoU = 21.58
+# ===> driveable_surface - IoU = 49.85
+# ===> other_flat - IoU = 25.5
+# ===> sidewalk - IoU = 26.78
+# ===> terrain - IoU = 21.14
+# ===> manmade - IoU = 5.76
+# ===> vegetation - IoU = 7.09
+# ===> mIoU of 6019 samples: 18.05
\ No newline at end of file
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-stbase-4d-stereo-512x1408.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-stbase-4d-stereo-512x1408.py
+# Copyright (c) Phigent Robotics. All rights reserved.
+# align_after_view_transfromation=True: (no results recorded here)
+
+# align_after_view_transfromation=False
+# 1x/12epoch
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 10.12
+# ===> barrier - IoU = 48.06
+# ===> bicycle - IoU = 0.0
+# ===> bus - IoU = 51.19
+# ===> car - IoU = 53.61
+# ===> construction_vehicle - IoU = 27.15
+# ===> motorcycle - IoU = 2.74
+# ===> pedestrian - IoU = 28.3
+# ===> traffic_cone - IoU = 23.33
+# ===> trailer - IoU = 36.24
+# ===> truck - IoU = 42.13
+# ===> driveable_surface - IoU = 81.77
+# ===> other_flat - IoU = 42.43
+# ===> sidewalk - IoU = 53.67
+# ===> terrain - IoU = 57.31
+# ===> manmade - IoU = 48.27
+# ===> vegetation - IoU = 43.31
+# ===> mIoU of 6019 samples: 38.21
+
+# 2x/24epoch
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 12.15
+# ===> barrier - IoU = 49.63
+# ===> bicycle - IoU = 25.1
+# ===> bus - IoU = 52.02
+# ===> car - IoU = 54.46
+# ===> construction_vehicle - IoU = 27.87
+# ===> motorcycle - IoU = 27.99
+# ===> pedestrian - IoU = 28.94
+# ===> traffic_cone - IoU = 27.23
+# ===> trailer - IoU = 36.43
+# ===> truck - IoU = 42.22
+# ===> driveable_surface - IoU = 82.31
+# ===> other_flat - IoU = 43.29
+# ===> sidewalk - IoU = 54.62
+# ===> terrain - IoU = 57.9
+# ===> manmade - IoU = 48.61
+# ===> vegetation - IoU = 43.55
+# ===> mIoU of 6019 samples: 42.02
+
+# 3x/36epoch
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 12.37
+# ===> barrier - IoU = 50.15
+# ===> bicycle - IoU = 26.97
+# ===> bus - IoU = 51.86
+# ===> car - IoU = 54.65
+# ===> construction_vehicle - IoU = 28.38
+# ===> motorcycle - IoU = 28.96
+# ===> pedestrian - IoU = 29.02
+# ===> traffic_cone - IoU = 28.28
+# ===> trailer - IoU = 37.05
+# ===> truck - IoU = 42.52
+# ===> driveable_surface - IoU = 82.55
+# ===> other_flat - IoU = 43.15
+# ===> sidewalk - IoU = 54.87
+# ===> terrain - IoU = 58.33
+# ===> manmade - IoU = 48.78
+# ===> vegetation - IoU = 43.79
+# ===> mIoU of 6019 samples: 42.45
+
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (512, 1408),
+    'src_size': (900, 1600),
+
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+
+# Model
+grid_config = {
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 0.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+
+voxel_size = [0.1, 0.1, 0.2]
+
+numC_Trans = 32
+
+multi_adj_frame_id_cfg = (1, 1+1, 1)
+
+model = dict(
+    type='BEVStereo4DOCC',
+    align_after_view_transfromation=False,
+    num_adj=len(range(*multi_adj_frame_id_cfg)),
+    img_backbone=dict(
+        type='SwinTransformer',
+        pretrain_img_size=224,
+        patch_size=4,
+        window_size=12,
+        mlp_ratio=4,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        strides=(4, 2, 2, 2),
+        out_indices=(2, 3),
+        qkv_bias=True,
+        qk_scale=None,
+        patch_norm=True,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.1,
+        use_abs_pos_embed=False,
+        return_stereo_feat=True,
+        act_cfg=dict(type='GELU'),
+        norm_cfg=dict(type='LN', requires_grad=True),
+        pretrain_style='official',
+        output_missing_index_as_none=False),
+    img_neck=dict(
+        type='FPN_LSS',
+        in_channels=512 + 1024,
+        out_channels=512,
+        # with_cp=False,
+        extra_upsample=None,
+        input_feature_index=(0, 1),
+        scale_factor=2),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVStereo',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=512,
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=False,
+        loss_depth_weight=0.05,
+        depthnet_cfg=dict(use_dcn=False,
+                          aspp_mid_channels=96,
+                          stereo=True,
+                          bias=5.),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
+        num_layer=[1, 2, 4],
+        with_cp=False,
+        num_channels=[numC_Trans,numC_Trans*2,numC_Trans*4],
+        stride=[1,2,2],
+        backbone_output_ids=[0,1,2]),
+    img_bev_encoder_neck=dict(type='LSSFPN3D',
+                              in_channels=numC_Trans*7,
+                              out_channels=numC_Trans),
+    pre_process=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans,
+        with_cp=False,
+        num_layer=[1,],
+        num_channels=[numC_Trans,],
+        stride=[1,],
+        backbone_output_ids=[0,]),
+    occ_head=dict(
+        type='BEVOCCHead3D',
+        in_dim=numC_Trans,
+        out_dim=32,
+        use_mask=True,
+        num_classes=18,
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar','mask_camera'])
+]
+
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+
+# Settings merged into every split (train/val/test) by the loop below.
+# NOTE(review): unlike the other configs in this commit, data_root is not
+# part of the shared settings here. `train` sets it explicitly; val/test
+# presumably inherit it from the nus-3d.py base config — confirm.
+share_data_config = dict(
+    type=dataset_type,
+    classes=class_names,
+    modality=input_modality,
+    stereo=True,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet4d',
+    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
+)
+
+# Shared by both `val` and `test` below (the same dict object is used).
+test_data_config = dict(
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+
+data = dict(
+    samples_per_gpu=1,  # with 32 GPUs
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+
+# Merge the shared settings into each split. `val` and `test` reference the
+# same dict, so it is updated twice with identical values (harmless).
+for key in ['val', 'train', 'test']:
+    data[key].update(share_data_config)
+
+# Optimizer
+# AdamW with gradient-norm clipping (L2 norm, max 5).
+optimizer = dict(type='AdamW', lr=2e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+# Step LR policy with a 200-iteration linear warmup.
+# NOTE(review): the only decay step (24) equals max_epochs (24) below, so
+# the step decay presumably never takes effect during the run — confirm
+# this is intended.
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24,])
+# 24 epochs as committed; the header comments of this file also report
+# 12-epoch and 36-epoch results, which need a different max_epochs.
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+
+# Custom hooks from the plugin; parameter semantics are defined by the hook
+# classes (not shown here).
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+    dict(
+        type='SyncbnControlHook',
+        syncbn_start_epoch=0,
+    ),
+]
+
+# Checkpoint loaded before training; the path is relative.
+# NOTE(review): this file sets no `evaluation` or `checkpoint_config`,
+# unlike the r50 configs in this commit; presumably the base-config
+# defaults apply — confirm.
+load_from="ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-4d-stereo.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-4d-stereo.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams': 6,
+    'input_size': (256, 704),
+    'src_size': (900, 1600),
+
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+
+# Grid definition shared by the view transformer and PointToMultiViewDepth.
+# Each entry is presumably [lower bound, upper bound, step] — TODO confirm.
+# The z step (6.4) equals the full z extent (5.4 - (-1) = 6.4), so under
+# that reading there is a single z cell, i.e. a flat BEV grid.
+grid_config = {
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 6.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+
+# NOTE(review): voxel_size is not referenced again in this config.
+voxel_size = [0.1, 0.1, 0.2]
+
+# Base channel count; the view-transformer output width and the BEV
+# encoder widths below are derived from it.
+numC_Trans = 80
+# Unpacked into range() below: range(1, 2, 1) -> [1], i.e. one entry.
+multi_adj_frame_id_cfg = (1, 1+1, 1)
+
+
+# Model definition. Same BEVStereo4DOCC model type as the bevdet_occ stereo
+# configs, but with 2D BEV components (CustomResNet / FPN_LSS /
+# BEVOCCHead2D) instead of their 3D counterparts.
+model = dict(
+    type='BEVStereo4DOCC',
+    # Spelling ("transfromation") is kept as-is: the key is passed to the
+    # model as a keyword argument.
+    align_after_view_transfromation=False,
+    # len(range(1, 2, 1)) == 1
+    num_adj=len(range(*multi_adj_frame_id_cfg)),
+    img_backbone=dict(
+        # NOTE(review): backbone weights are requested here and a full
+        # checkpoint is also set via load_from later in this file; which one
+        # wins depends on the loading order — confirm.
+        pretrained='torchvision://resnet50',
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVStereo',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        # matches img_neck out_channels (256)
+        in_channels=256,
+        out_channels=numC_Trans,
+        sid=True,
+        loss_depth_weight=0.05,
+        depthnet_cfg=dict(use_dcn=False,
+                          aspp_mid_channels=96,
+                          stereo=True,
+                          bias=5.),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        # 80 * (1 + 1) = 160 input channels
+        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
+        # [160, 320, 640]
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        # 640 + 160 = 800: the last plus the first backbone num_channels
+        in_channels=numC_Trans * 8 + numC_Trans * 2,
+        out_channels=256),
+    pre_process=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_layer=[1, ],
+        num_channels=[numC_Trans, ],
+        stride=[1, ],
+        backbone_output_ids=[0, ]),
+    occ_head=dict(
+        type='BEVOCCHead2D',
+        # matches img_bev_encoder_neck out_channels (256)
+        in_dim=256,
+        out_dim=256,
+        # presumably the number of height bins predicted per BEV cell:
+        # (5.4 - (-1)) / 0.4 = 16 with the 0.4 z step used by the bevdet_occ
+        # configs — TODO confirm against BEVOCCHead2D.
+        Dz=16,
+        use_mask=True,
+        # NOTE(review): the IoU table at the end of this file lists 17
+        # classes; the 18th is presumably the free/empty class — confirm.
+        num_classes=18,
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera'])
+]
+
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+
+share_data_config = dict(
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=True,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet4d',
+    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
+)
+
+test_data_config = dict(
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+
+for key in ['val', 'train', 'test']:
+    data[key].update(share_data_config)
+
+# Optimizer: AdamW with gradient-norm clipping.
+optimizer = {'type': 'AdamW', 'lr': 1e-4, 'weight_decay': 1e-2}
+optimizer_config = {'grad_clip': {'max_norm': 5, 'norm_type': 2}}
+# Step LR policy with a 200-iteration linear warmup.
+lr_config = {
+    'policy': 'step',
+    'warmup': 'linear',
+    'warmup_iters': 200,
+    'warmup_ratio': 0.001,
+    'step': [24],
+}
+runner = {'type': 'EpochBasedRunner', 'max_epochs': 24}
+
+custom_hooks = [
+    {
+        'type': 'MEGVIIEMAHook',
+        'init_updates': 10560,
+        'priority': 'NORMAL',
+    },
+]
+
+# Checkpoint loaded before training starts.
+load_from = "./ckpts/bevdet-r50-4d-stereo-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
+evaluation = {'interval': 1, 'start': 20, 'pipeline': test_pipeline}
+checkpoint_config = {'interval': 1, 'max_keep_ckpts': 5}
+
+
+# with_pretrain:
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 9.08
+# ===> barrier - IoU = 46.32
+# ===> bicycle - IoU = 17.71
+# ===> bus - IoU = 42.7
+# ===> car - IoU = 50.64
+# ===> construction_vehicle - IoU = 23.72
+# ===> motorcycle - IoU = 20.13
+# ===> pedestrian - IoU = 22.34
+# ===> traffic_cone - IoU = 24.09
+# ===> trailer - IoU = 30.26
+# ===> truck - IoU = 37.39
+# ===> driveable_surface - IoU = 81.68
+# ===> other_flat - IoU = 40.13
+# ===> sidewalk - IoU = 52.34
+# ===> terrain - IoU = 56.46
+# ===> manmade - IoU = 47.69
+# ===> vegetation - IoU = 40.6
+# ===> mIoU of 6019 samples: 37.84
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-M0-trt.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-M0-trt.py
+# Variant of flashocc-r50-M0.py: inherits the whole base config and only
+# overrides the two model switches below.
+_base_ = ['./flashocc-r50-M0.py']
+
+model = {
+    'wocc': True,
+    'wdet3d': False,
+}
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-M0.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-M0.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (256, 704),
+    'src_size': (900, 1600),
+
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+
+grid_config = {
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 6.4],
+    'depth': [1.0, 45.0, 1.0],
+}
+
+voxel_size = [0.1, 0.1, 0.2]
+
+numC_Trans = 64
+
+model = dict(
+    type='BEVDetOCC',
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch',
+        pretrained='torchvision://resnet50',
+    ),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformer',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=True,
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,
+        out_channels=128),
+    occ_head=dict(
+        type='BEVOCCHead2D',
+        in_dim=128,
+        out_dim=128,
+        Dz=16,
+        use_mask=True,
+        num_classes=18,
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5
+)
+
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera'])
+]
+
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+
+
+# Sensor modalities: only the camera flag is switched on.
+input_modality = {
+    'use_lidar': False,
+    'use_camera': True,
+    'use_radar': False,
+    'use_map': False,
+    'use_external': False,
+}
+
+# Settings merged into every split (train/val/test) by the loop below.
+share_data_config = {
+    'type': dataset_type,
+    'data_root': data_root,
+    'classes': class_names,
+    'modality': input_modality,
+    'stereo': False,
+    'filter_empty_gt': False,
+    'img_info_prototype': 'bevdet',
+}
+
+# The same dict object is used for both the `val` and `test` splits.
+test_data_config = {
+    'pipeline': test_pipeline,
+    'ann_file': data_root + 'bevdetv2-nuscenes_infos_val.pkl',
+}
+
+data = {
+    'samples_per_gpu': 4,
+    'workers_per_gpu': 4,
+    'train': {
+        'data_root': data_root,
+        'ann_file': data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        'pipeline': train_pipeline,
+        'classes': class_names,
+        'test_mode': False,
+        'use_valid_flag': True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        'box_type_3d': 'LiDAR',
+    },
+    'val': test_data_config,
+    'test': test_data_config,
+}
+
+# Merge the shared settings into each split in place.
+for key in ('val', 'train', 'test'):
+    data[key].update(share_data_config)
+
+# Optimizer: AdamW with gradient-norm clipping.
+optimizer = {'type': 'AdamW', 'lr': 1e-4, 'weight_decay': 1e-2}
+optimizer_config = {'grad_clip': {'max_norm': 5, 'norm_type': 2}}
+# Step LR policy with a 200-iteration linear warmup.
+lr_config = {
+    'policy': 'step',
+    'warmup': 'linear',
+    'warmup_iters': 200,
+    'warmup_ratio': 0.001,
+    'step': [24],
+}
+runner = {'type': 'EpochBasedRunner', 'max_epochs': 24}
+
+custom_hooks = [
+    {
+        'type': 'MEGVIIEMAHook',
+        'init_updates': 10560,
+        'priority': 'NORMAL',
+    },
+]
+
+# Checkpoint loaded before training starts.
+load_from = "ckpts/bevdet-r50-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
+evaluation = {'interval': 1, 'start': 20, 'pipeline': test_pipeline}
+checkpoint_config = {'interval': 1, 'max_keep_ckpts': 5}
+
+
+# with det pretrain; use_mask=True; out_dim=256
+# NOTE(review): occ_head in this config sets out_dim=128 - confirm which setting produced the numbers below.
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 6.21
+# ===> barrier - IoU = 39.56
+# ===> bicycle - IoU = 11.27
+# ===> bus - IoU = 36.31
+# ===> car - IoU = 43.96
+# ===> construction_vehicle - IoU = 16.25
+# ===> motorcycle - IoU = 14.74
+# ===> pedestrian - IoU = 16.89
+# ===> traffic_cone - IoU = 15.76
+# ===> trailer - IoU = 28.56
+# ===> truck - IoU = 30.91
+# ===> driveable_surface - IoU = 78.16
+# ===> other_flat - IoU = 37.52
+# ===> sidewalk - IoU = 47.42
+# ===> terrain - IoU = 51.35
+# ===> manmade - IoU = 36.79
+# ===> vegetation - IoU = 31.42
+# ===> mIoU of 6019 samples: 31.95
+# {'mIoU': array([0.06207982, 0.39564533, 0.11270112, 0.36311426, 0.43955401,
+#        0.16252583, 0.14739984, 0.16885096, 0.15757262, 0.28564777,
+#        0.30909029, 0.7815907 , 0.37523904, 0.47420705, 0.51351759,
+#        0.36789645, 0.31420157, 0.87802724])}
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-trt.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-trt.py
+# Variant of flashocc-r50.py: inherits the whole base config and only
+# overrides the two model switches below.
+_base_ = ['./flashocc-r50.py']
+
+model = {
+    'wocc': True,
+    'wdet3d': False,
+}
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (256, 704),
+    'src_size': (900, 1600),
+
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+
+grid_config = {
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 6.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+
+voxel_size = [0.1, 0.1, 0.2]
+
+numC_Trans = 64
+
+model = dict(
+    type='BEVDetOCC',
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch',
+        #pretrained='torchvision://resnet50',
+    ),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformer',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=True,
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,
+        out_channels=256),
+    occ_head=dict(
+        type='BEVOCCHead2D',
+        in_dim=256,
+        out_dim=256,    # out_dim=128 for M0!!!
+        Dz=16,
+        use_mask=True,
+        num_classes=18,
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5
+)
+
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera'])
+]
+
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+
+
+# Sensor modalities: only the camera flag is switched on.
+input_modality = {
+    'use_lidar': False,
+    'use_camera': True,
+    'use_radar': False,
+    'use_map': False,
+    'use_external': False,
+}
+
+# Settings merged into every split (train/val/test) by the loop below.
+share_data_config = {
+    'type': dataset_type,
+    'data_root': data_root,
+    'classes': class_names,
+    'modality': input_modality,
+    'stereo': False,
+    'filter_empty_gt': False,
+    'img_info_prototype': 'bevdet',
+}
+
+# The same dict object is used for both the `val` and `test` splits.
+test_data_config = {
+    'pipeline': test_pipeline,
+    'ann_file': data_root + 'bevdetv2-nuscenes_infos_val.pkl',
+}
+
+data = {
+    'samples_per_gpu': 24,
+    'workers_per_gpu': 24,
+    'train': {
+        'data_root': data_root,
+        'ann_file': data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        'pipeline': train_pipeline,
+        'classes': class_names,
+        'test_mode': False,
+        'use_valid_flag': True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        'box_type_3d': 'LiDAR',
+    },
+    'val': test_data_config,
+    'test': test_data_config,
+}
+
+# Merge the shared settings into each split in place.
+for key in ('val', 'train', 'test'):
+    data[key].update(share_data_config)
+
+# Optimizer: AdamW with gradient-norm clipping.
+optimizer = {'type': 'AdamW', 'lr': 1e-4, 'weight_decay': 1e-2}
+optimizer_config = {'grad_clip': {'max_norm': 5, 'norm_type': 2}}
+# Step LR policy with a 200-iteration linear warmup.
+lr_config = {
+    'policy': 'step',
+    'warmup': 'linear',
+    'warmup_iters': 200,
+    'warmup_ratio': 0.001,
+    'step': [24],
+}
+runner = {'type': 'EpochBasedRunner', 'max_epochs': 24}
+
+custom_hooks = [
+    {
+        'type': 'MEGVIIEMAHook',
+        'init_updates': 10560,
+        'priority': 'NORMAL',
+    },
+]
+
+# Checkpoint loaded before training starts.
+load_from = "ckpts/bevdet-r50-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
+evaluation = {'interval': 1, 'start': 20, 'pipeline': test_pipeline}
+checkpoint_config = {'interval': 1, 'max_keep_ckpts': 5}
+
+
+# with det pretrain; use_mask=True;
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 6.74
+# ===> barrier - IoU = 37.65
+# ===> bicycle - IoU = 10.26
+# ===> bus - IoU = 39.55
+# ===> car - IoU = 44.36
+# ===> construction_vehicle - IoU = 14.88
+# ===> motorcycle - IoU = 13.4
+# ===> pedestrian - IoU = 15.79
+# ===> traffic_cone - IoU = 15.38
+# ===> trailer - IoU = 27.44
+# ===> truck - IoU = 31.73
+# ===> driveable_surface - IoU = 78.82
+# ===> other_flat - IoU = 37.98
+# ===> sidewalk - IoU = 48.7
+# ===> terrain - IoU = 52.5
+# ===> manmade - IoU = 37.89
+# ===> vegetation - IoU = 32.24
+# ===> mIoU of 6019 samples: 32.08
+
+# with det pretrain; use_mask=False; class_balance=True
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 4.49
+# ===> barrier - IoU = 29.59
+# ===> bicycle - IoU = 7.38
+# ===> bus - IoU = 30.32
+# ===> car - IoU = 32.22
+# ===> construction_vehicle - IoU = 13.04
+# ===> motorcycle - IoU = 11.91
+# ===> pedestrian - IoU = 8.61
+# ===> traffic_cone - IoU = 8.11
+# ===> trailer - IoU = 7.66
+# ===> truck - IoU = 20.84
+# ===> driveable_surface - IoU = 48.59
+# ===> other_flat - IoU = 26.62
+# ===> sidewalk - IoU = 26.08
+# ===> terrain - IoU = 20.86
+# ===> manmade - IoU = 7.62
+# ===> vegetation - IoU = 7.14
+# ===> mIoU of 6019 samples: 18.3
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (512, 1408),
+    'src_size': (900, 1600),
+
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+
+# Model
+grid_config = {
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 6.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+
+voxel_size = [0.1, 0.1, 0.2]
+
+numC_Trans = 80
+
+multi_adj_frame_id_cfg = (1, 1+1, 1)
+
+model = dict(
+    type='BEVStereo4DOCC',
+    align_after_view_transfromation=False,
+    num_adj=len(range(*multi_adj_frame_id_cfg)),
+    img_backbone=dict(
+        type='SwinTransformer',
+        pretrain_img_size=224,
+        patch_size=4,
+        window_size=12,
+        mlp_ratio=4,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        strides=(4, 2, 2, 2),
+        out_indices=(2, 3),
+        qkv_bias=True,
+        qk_scale=None,
+        patch_norm=True,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.1,
+        use_abs_pos_embed=False,
+        return_stereo_feat=True,
+        act_cfg=dict(type='GELU'),
+        norm_cfg=dict(type='LN', requires_grad=True),
+        pretrain_style='official',
+        output_missing_index_as_none=False),
+    img_neck=dict(
+        type='FPN_LSS',
+        in_channels=512 + 1024,
+        out_channels=512,
+        # with_cp=False,
+        extra_upsample=None,
+        input_feature_index=(0, 1),
+        scale_factor=2),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVStereo',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=512,
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=True,
+        loss_depth_weight=0.05,
+        depthnet_cfg=dict(use_dcn=False,
+                          aspp_mid_channels=96,
+                          stereo=True,
+                          bias=5.),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        with_cp=True,
+        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,
+        out_channels=256),
+    pre_process=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_layer=[1, ],
+        num_channels=[numC_Trans, ],
+        stride=[1, ],
+        backbone_output_ids=[0, ]),
+    occ_head=dict(
+        type='BEVOCCHead2D',
+        in_dim=256,
+        out_dim=256,
+        Dz=16,
+        use_mask=True,
+        num_classes=18,
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar','mask_camera'])
+]
+
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+
+# Sensor modalities: only the camera flag is switched on.
+input_modality = {
+    'use_lidar': False,
+    'use_camera': True,
+    'use_radar': False,
+    'use_map': False,
+    'use_external': False,
+}
+
+# Settings merged into every split (train/val/test) by the loop below.
+share_data_config = {
+    'type': dataset_type,
+    'classes': class_names,
+    'modality': input_modality,
+    'stereo': True,
+    'filter_empty_gt': False,
+    'img_info_prototype': 'bevdet4d',
+    'multi_adj_frame_id_cfg': multi_adj_frame_id_cfg,
+}
+
+# The same dict object is used for both the `val` and `test` splits.
+test_data_config = {
+    'data_root': data_root,
+    'pipeline': test_pipeline,
+    'ann_file': data_root + 'bevdetv2-nuscenes_infos_val.pkl',
+}
+
+data = {
+    'samples_per_gpu': 4,  # with 32 GPU
+    'workers_per_gpu': 4,
+    'train': {
+        'data_root': data_root,
+        'ann_file': data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        'pipeline': train_pipeline,
+        'classes': class_names,
+        'test_mode': False,
+        'use_valid_flag': True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        'box_type_3d': 'LiDAR',
+    },
+    'val': test_data_config,
+    'test': test_data_config,
+}
+
+# Merge the shared settings into each split in place.
+for key in ('val', 'train', 'test'):
+    data[key].update(share_data_config)
+
+# Optimizer: AdamW with gradient-norm clipping.
+optimizer = {'type': 'AdamW', 'lr': 1e-4, 'weight_decay': 1e-2}
+optimizer_config = {'grad_clip': {'max_norm': 5, 'norm_type': 2}}
+# Step LR policy with a 200-iteration linear warmup.
+lr_config = {
+    'policy': 'step',
+    'warmup': 'linear',
+    'warmup_iters': 200,
+    'warmup_ratio': 0.001,
+    'step': [24],
+}
+runner = {'type': 'EpochBasedRunner', 'max_epochs': 24}
+
+custom_hooks = [
+    {
+        'type': 'MEGVIIEMAHook',
+        'init_updates': 10560,
+        'priority': 'NORMAL',
+    },
+    {
+        'type': 'SyncbnControlHook',
+        'syncbn_start_epoch': 0,
+    },
+]
+
+evaluation = dict(interval=6, start=0, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=3)
+# Initialise from the pretrained BEVDet checkpoint, as the sibling
+# `..._4x4_2e-4.py` config does.
+load_from = "ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"
+# NOTE(review): this config previously hard-coded
+#   resume_from="work_dirs/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2/epoch_5.pth"
+# which only exists on the machine of an interrupted run. To continue such a
+# run, pass the checkpoint at launch time (e.g. the training script's
+# `--resume-from` option, if available) instead of committing the path.
+# fp16 = dict(loss_scale='dynamic')
+
+# bash tools/dist_train.sh projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py 4
\ No newline at end of file
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (512, 1408),
+    'src_size': (900, 1600),
+
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+
+# Model
+grid_config = {
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 6.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+
+voxel_size = [0.1, 0.1, 0.2]
+
+numC_Trans = 80
+
+multi_adj_frame_id_cfg = (1, 1+1, 1)
+
+# Model definition: Swin backbone -> FPN_LSS image neck -> stereo LSS view
+# transformer -> BEV encoder (CustomResNet + FPN_LSS) -> 2D occupancy head.
+model = dict(
+    type='BEVStereo4DOCC',
+    align_after_view_transfromation=False,
+    # Number of adjacent frames, derived from multi_adj_frame_id_cfg.
+    num_adj=len(range(*multi_adj_frame_id_cfg)),
+    img_backbone=dict(
+        type='SwinTransformer',
+        pretrain_img_size=224,
+        patch_size=4,
+        window_size=12,
+        mlp_ratio=4,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        strides=(4, 2, 2, 2),
+        out_indices=(2, 3),
+        qkv_bias=True,
+        qk_scale=None,
+        patch_norm=True,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.1,
+        use_abs_pos_embed=False,
+        return_stereo_feat=True,
+        act_cfg=dict(type='GELU'),
+        norm_cfg=dict(type='LN', requires_grad=True),
+        pretrain_style='official',
+        output_missing_index_as_none=False),
+    img_neck=dict(
+        type='FPN_LSS',
+        in_channels=512 + 1024,
+        out_channels=512,
+        # with_cp=False,
+        extra_upsample=None,
+        input_feature_index=(0, 1),
+        scale_factor=2),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVStereo',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=512,
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=True,
+        loss_depth_weight=0.05,
+        depthnet_cfg=dict(use_dcn=False,
+                          aspp_mid_channels=96,
+                          stereo=True,
+                          bias=5.),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        with_cp=True,
+        # Input width is numC_Trans per frame: the adjacent frames plus one.
+        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,
+        out_channels=256),
+    pre_process=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_layer=[1, ],
+        num_channels=[numC_Trans, ],
+        stride=[1, ],
+        backbone_output_ids=[0, ]),
+    occ_head=dict(
+        type='BEVOCCHead2D',
+        in_dim=256,
+        out_dim=256,
+        Dz=16,
+        use_mask=True,
+        num_classes=18,
+        use_predicter=True,
+        # `class_balance` is the keyword the sibling FlashOCC configs pass to
+        # BEVOCCHead2D; this file previously passed `class_wise`.
+        # NOTE(review): confirm against the head's __init__ signature.
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar','mask_camera'])
+]
+
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+
+# Sensor modalities: only the camera flag is switched on.
+input_modality = {
+    'use_lidar': False,
+    'use_camera': True,
+    'use_radar': False,
+    'use_map': False,
+    'use_external': False,
+}
+
+# Settings merged into every split (train/val/test) by the loop below.
+share_data_config = {
+    'type': dataset_type,
+    'classes': class_names,
+    'modality': input_modality,
+    'stereo': True,
+    'filter_empty_gt': False,
+    'img_info_prototype': 'bevdet4d',
+    'multi_adj_frame_id_cfg': multi_adj_frame_id_cfg,
+}
+
+# The same dict object is used for both the `val` and `test` splits.
+test_data_config = {
+    'data_root': data_root,
+    'pipeline': test_pipeline,
+    'ann_file': data_root + 'bevdetv2-nuscenes_infos_val.pkl',
+}
+
+data = {
+    'samples_per_gpu': 4,  # with 32 GPU
+    'workers_per_gpu': 4,
+    'train': {
+        'data_root': data_root,
+        'ann_file': data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        'pipeline': train_pipeline,
+        'classes': class_names,
+        'test_mode': False,
+        'use_valid_flag': True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        'box_type_3d': 'LiDAR',
+    },
+    'val': test_data_config,
+    'test': test_data_config,
+}
+
+# Merge the shared settings into each split in place.
+for key in ('val', 'train', 'test'):
+    data[key].update(share_data_config)
+
+# Optimizer: AdamW with gradient-norm clipping.
+optimizer = {'type': 'AdamW', 'lr': 2e-4, 'weight_decay': 1e-2}
+optimizer_config = {'grad_clip': {'max_norm': 5, 'norm_type': 2}}
+# Step LR policy with a 200-iteration linear warmup.
+lr_config = {
+    'policy': 'step',
+    'warmup': 'linear',
+    'warmup_iters': 200,
+    'warmup_ratio': 0.001,
+    'step': [24],
+}
+runner = {'type': 'EpochBasedRunner', 'max_epochs': 24}
+
+custom_hooks = [
+    {
+        'type': 'MEGVIIEMAHook',
+        'init_updates': 10560,
+        'priority': 'NORMAL',
+    },
+    {
+        'type': 'SyncbnControlHook',
+        'syncbn_start_epoch': 0,
+    },
+]
+
+evaluation = {'interval': 6, 'start': 0, 'pipeline': test_pipeline}
+checkpoint_config = {'interval': 1, 'max_keep_ckpts': 3}
+# Checkpoint loaded before training starts.
+load_from = "ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
+
+# bash tools/dist_train.sh projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py 4
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 13.42
+# ===> barrier - IoU = 51.07
+# ===> bicycle - IoU = 27.68
+# ===> bus - IoU = 51.57
+# ===> car - IoU = 56.22
+# ===> construction_vehicle - IoU = 27.27
+# ===> motorcycle - IoU = 29.98
+# ===> pedestrian - IoU = 29.93
+# ===> traffic_cone - IoU = 29.8
+# ===> trailer - IoU = 37.77
+# ===> truck - IoU = 43.52
+# ===> driveable_surface - IoU = 83.81
+# ===> other_flat - IoU = 46.55
+# ===> sidewalk - IoU = 56.15
+# ===> terrain - IoU = 59.56
+# ===> manmade - IoU = 50.84
+# ===> vegetation - IoU = 44.67
+# ===> mIoU of 6019 samples: 43.52
+
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 13.31
+# ===> barrier - IoU = 51.62
+# ===> bicycle - IoU = 28.07
+# ===> bus - IoU = 50.91
+# ===> car - IoU = 55.69
+# ===> construction_vehicle - IoU = 27.46
+# ===> motorcycle - IoU = 31.05
+# ===> pedestrian - IoU = 29.98
+# ===> traffic_cone - IoU = 29.2
+# ===> trailer - IoU = 38.86
+# ===> truck - IoU = 43.68
+# ===> driveable_surface - IoU = 83.87
+# ===> other_flat - IoU = 45.63
+# ===> sidewalk - IoU = 56.33
+# ===> terrain - IoU = 59.01
+# ===> manmade - IoU = 50.63
+# ===> vegetation - IoU = 44.56
+# ===> mIoU of 6019 samples: 43.52
+# {'mIoU': array([0.13311691, 0.51617081, 0.28070517, 0.50911942, 0.55694228,
+#        0.27461342, 0.31050779, 0.29979125, 0.29204287, 0.38862984,
+#        0.43680049, 0.83872518, 0.45630227, 0.56327839, 0.59008883,
+#        0.50627122, 0.44564523, 0.90959399])}
\ No newline at end of file
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-pano.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-pano.py
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano-trt.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano-trt.py
+# Derived config: takes all settings from the base file below and overrides
+# only the two model flags (presumably the TensorRT variant, per the ``-trt``
+# file-name suffix -- confirm).
+# The base path must name a file in this directory; the sibling config is
+# ``panoptic-flashocc-r50-depth-tiny-pano.py`` (the former ``flashoccv2-``
+# prefix does not match any file added alongside it).
+_base_ = ['./panoptic-flashocc-r50-depth-tiny-pano.py']
+
+model = dict(
+    wocc=True,
+    wdet3d=False,
+)
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano.py
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny.py
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-trt.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-trt.py
+# Derived config: takes all settings from the base file below and overrides
+# only the two model flags (presumably the TensorRT variant, per the ``-trt``
+# file-name suffix -- confirm).
+# The base path follows this directory's ``panoptic-flashocc-*`` naming; the
+# former ``flashoccv2-`` prefix does not match the files in this directory.
+# NOTE(review): verify that ``panoptic-flashocc-r50-depth.py`` is present.
+_base_ = ['./panoptic-flashocc-r50-depth.py']
+
+model = dict(
+    wocc=True,
+    wdet3d=False,
+)