Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
lishj6
Flashocc
Commits
3b8d508a
Commit
3b8d508a
authored
Sep 05, 2025
by
lishj6
🏸
Browse files
init_0905
parent
e968ab0f
Pipeline
#2906
canceled with stages
Changes
156
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
4372 additions
and
0 deletions
+4372
-0
projects/configs/flashocc/flashocc-r50.py
projects/configs/flashocc/flashocc-r50.py
+268
-0
projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py
...s/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py
+260
-0
projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py
...s/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py
+302
-0
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-pano.py
...igs/panoptic-flashocc/panoptic-flashocc-r50-depth-pano.py
+356
-0
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano-trt.py
...tic-flashocc/panoptic-flashocc-r50-depth-tiny-pano-trt.py
+7
-0
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano.py
...anoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano.py
+357
-0
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny.py
...igs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny.py
+279
-0
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-trt.py
...figs/panoptic-flashocc/panoptic-flashocc-r50-depth-trt.py
+7
-0
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth.py
.../configs/panoptic-flashocc/panoptic-flashocc-r50-depth.py
+336
-0
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth4d-longterm16f-pano.py
...lashocc/panoptic-flashocc-r50-depth4d-longterm16f-pano.py
+294
-0
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth4d-longterm16f.py
...tic-flashocc/panoptic-flashocc-r50-depth4d-longterm16f.py
+283
-0
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth4d-longterm8f-pano.py
...flashocc/panoptic-flashocc-r50-depth4d-longterm8f-pano.py
+368
-0
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth4d-longterm8f.py
...ptic-flashocc/panoptic-flashocc-r50-depth4d-longterm8f.py
+283
-0
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth4d-pano.py
...s/panoptic-flashocc/panoptic-flashocc-r50-depth4d-pano.py
+367
-0
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth4d.py
...onfigs/panoptic-flashocc/panoptic-flashocc-r50-depth4d.py
+280
-0
projects/mmdet3d_plugin/__init__.py
projects/mmdet3d_plugin/__init__.py
+3
-0
projects/mmdet3d_plugin/core/__init__.py
projects/mmdet3d_plugin/core/__init__.py
+2
-0
projects/mmdet3d_plugin/core/bbox/__init__.py
projects/mmdet3d_plugin/core/bbox/__init__.py
+1
-0
projects/mmdet3d_plugin/core/bbox/coders/__init__.py
projects/mmdet3d_plugin/core/bbox/coders/__init__.py
+3
-0
projects/mmdet3d_plugin/core/bbox/coders/centerpoint_bbox_coders.py
...mdet3d_plugin/core/bbox/coders/centerpoint_bbox_coders.py
+316
-0
No files found.
projects/configs/flashocc/flashocc-r50.py
0 → 100644
View file @
3b8d508a
# FlashOcc-R50 occupancy config (single-frame BEVDetOCC, ResNet-50 backbone).
# Reconstructed from a token-per-line mangled extraction into valid Python;
# all values are byte-identical to the original token stream.
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
    'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# BEV grid: [min, max, interval] per axis; 'depth' is the frustum depth bins.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]

# Channel count of the view-transformed BEV feature.
numC_Trans = 64

model = dict(
    type='BEVDetOCC',
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    img_view_transformer=dict(
        type='LSSViewTransformer',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        sid=False,
        collapse_z=True,
        downsample=16),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256),
    occ_head=dict(
        type='BEVOCCHead2D',
        in_dim=256,
        out_dim=256,  # out_dim=128 for M0!!!
        Dz=16,
        use_mask=True,
        num_classes=18,
        use_predicter=True,
        class_balance=False,
        loss_occ=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            ignore_index=255,
            loss_weight=1.0),
    )
)

# Data
dataset_type = 'NuScenesDatasetOccpancy'  # sic: registered name in this repo
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs'])
        ])
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# Settings shared by train/val/test dataset dicts (merged in below).
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet',
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=34,
    workers_per_gpu=34,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=30)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
]

load_from = "ckpts/bevdet-r50-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# with det pretrain; use_mask=True;
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 6.74
# ===> barrier - IoU = 37.65
# ===> bicycle - IoU = 10.26
# ===> bus - IoU = 39.55
# ===> car - IoU = 44.36
# ===> construction_vehicle - IoU = 14.88
# ===> motorcycle - IoU = 13.4
# ===> pedestrian - IoU = 15.79
# ===> traffic_cone - IoU = 15.38
# ===> trailer - IoU = 27.44
# ===> truck - IoU = 31.73
# ===> driveable_surface - IoU = 78.82
# ===> other_flat - IoU = 37.98
# ===> sidewalk - IoU = 48.7
# ===> terrain - IoU = 52.5
# ===> manmade - IoU = 37.89
# ===> vegetation - IoU = 32.24
# ===> mIoU of 6019 samples: 32.08
# with det pretrain; use_mask=False; class_balance=True
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 4.49
# ===> barrier - IoU = 29.59
# ===> bicycle - IoU = 7.38
# ===> bus - IoU = 30.32
# ===> car - IoU = 32.22
# ===> construction_vehicle - IoU = 13.04
# ===> motorcycle - IoU = 11.91
# ===> pedestrian - IoU = 8.61
# ===> traffic_cone - IoU = 8.11
# ===> trailer - IoU = 7.66
# ===> truck - IoU = 20.84
# ===> driveable_surface - IoU = 48.59
# ===> other_flat - IoU = 26.62
# ===> sidewalk - IoU = 26.08
# ===> terrain - IoU = 20.86
# ===> manmade - IoU = 7.62
# ===> vegetation - IoU = 7.14
# ===> mIoU of 6019 samples: 18.3
projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py
0 → 100644
View file @
3b8d508a
# FlashOcc Swin-Base 4D-stereo config, 512x1408 input, 4 GPUs x 4 samples.
# Reconstructed from a token-per-line mangled extraction into valid Python;
# all values are byte-identical to the original token stream.
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
    'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (512, 1408),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# Model
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 80

# (start, stop, step) for range(): which adjacent frames are fused.
# (1, 2, 1) -> exactly one previous frame.
multi_adj_frame_id_cfg = (1, 1 + 1, 1)

model = dict(
    type='BEVStereo4DOCC',
    align_after_view_transfromation=False,
    num_adj=len(range(*multi_adj_frame_id_cfg)),
    img_backbone=dict(
        type='SwinTransformer',
        pretrain_img_size=224,
        patch_size=4,
        window_size=12,
        mlp_ratio=4,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        strides=(4, 2, 2, 2),
        out_indices=(2, 3),
        qkv_bias=True,
        qk_scale=None,
        patch_norm=True,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.1,
        use_abs_pos_embed=False,
        return_stereo_feat=True,
        act_cfg=dict(type='GELU'),
        norm_cfg=dict(type='LN', requires_grad=True),
        pretrain_style='official',
        output_missing_index_as_none=False),
    img_neck=dict(
        type='FPN_LSS',
        in_channels=512 + 1024,
        out_channels=512,
        # with_cp=False,
        extra_upsample=None,
        input_feature_index=(0, 1),
        scale_factor=2),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVStereo',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=512,
        out_channels=numC_Trans,
        sid=False,
        collapse_z=True,
        loss_depth_weight=0.05,
        depthnet_cfg=dict(
            use_dcn=False,
            aspp_mid_channels=96,
            stereo=True,
            bias=5.),
        downsample=16),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        with_cp=True,
        # Current frame + fused adjacent frames are concatenated channel-wise.
        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1),
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256),
    pre_process=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_layer=[1, ],
        num_channels=[numC_Trans, ],
        stride=[1, ],
        backbone_output_ids=[0, ]),
    occ_head=dict(
        type='BEVOCCHead2D',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=True,
        num_classes=18,
        use_predicter=True,
        class_balance=False,
        loss_occ=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            ignore_index=255,
            loss_weight=1.0),
    )
)

# Data
dataset_type = 'NuScenesDatasetOccpancy'  # sic: registered name in this repo
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs'])
        ])
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# Settings shared by train/val/test dataset dicts (merged in below).
share_data_config = dict(
    type=dataset_type,
    classes=class_names,
    modality=input_modality,
    stereo=True,
    filter_empty_gt=False,
    img_info_prototype='bevdet4d',
    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)

test_data_config = dict(
    data_root=data_root,
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,  # with 32 GPU
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
    dict(
        type='SyncbnControlHook',
        syncbn_start_epoch=0,
    ),
]

evaluation = dict(interval=6, start=0, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=3)

# load_from="ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"
resume_from = "work_dirs/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2/epoch_5.pth"
# fp16 = dict(loss_scale='dynamic')
# bash tools/dist_train.sh projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py 4
\ No newline at end of file
projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py
0 → 100644
View file @
3b8d508a
# FlashOcc Swin-Base 4D-stereo config, 512x1408 input, lr=2e-4 variant.
# Reconstructed from a token-per-line mangled extraction into valid Python;
# all values are byte-identical to the original token stream.
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
    'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (512, 1408),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# Model
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 80

# (start, stop, step) for range(): which adjacent frames are fused.
# (1, 2, 1) -> exactly one previous frame.
multi_adj_frame_id_cfg = (1, 1 + 1, 1)

model = dict(
    type='BEVStereo4DOCC',
    align_after_view_transfromation=False,
    num_adj=len(range(*multi_adj_frame_id_cfg)),
    img_backbone=dict(
        type='SwinTransformer',
        pretrain_img_size=224,
        patch_size=4,
        window_size=12,
        mlp_ratio=4,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        strides=(4, 2, 2, 2),
        out_indices=(2, 3),
        qkv_bias=True,
        qk_scale=None,
        patch_norm=True,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.1,
        use_abs_pos_embed=False,
        return_stereo_feat=True,
        act_cfg=dict(type='GELU'),
        norm_cfg=dict(type='LN', requires_grad=True),
        pretrain_style='official',
        output_missing_index_as_none=False),
    img_neck=dict(
        type='FPN_LSS',
        in_channels=512 + 1024,
        out_channels=512,
        # with_cp=False,
        extra_upsample=None,
        input_feature_index=(0, 1),
        scale_factor=2),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVStereo',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=512,
        out_channels=numC_Trans,
        sid=False,
        collapse_z=True,
        loss_depth_weight=0.05,
        depthnet_cfg=dict(
            use_dcn=False,
            aspp_mid_channels=96,
            stereo=True,
            bias=5.),
        downsample=16),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        with_cp=True,
        # Current frame + fused adjacent frames are concatenated channel-wise.
        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1),
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256),
    pre_process=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_layer=[1, ],
        num_channels=[numC_Trans, ],
        stride=[1, ],
        backbone_output_ids=[0, ]),
    occ_head=dict(
        type='BEVOCCHead2D',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=True,
        num_classes=18,
        use_predicter=True,
        # NOTE(review): sibling configs pass `class_balance` here; confirm
        # that BEVOCCHead2D actually accepts `class_wise` and this is not a
        # silently ignored / misnamed kwarg.
        class_wise=False,
        loss_occ=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            ignore_index=255,
            loss_weight=1.0),
    )
)

# Data
dataset_type = 'NuScenesDatasetOccpancy'  # sic: registered name in this repo
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics',
              'mask_lidar', 'mask_camera'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs'])
        ])
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# Settings shared by train/val/test dataset dicts (merged in below).
share_data_config = dict(
    type=dataset_type,
    classes=class_names,
    modality=input_modality,
    stereo=True,
    filter_empty_gt=False,
    img_info_prototype='bevdet4d',
    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)

test_data_config = dict(
    data_root=data_root,
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,  # with 32 GPU
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=2e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
    dict(
        type='SyncbnControlHook',
        syncbn_start_epoch=0,
    ),
]

evaluation = dict(interval=6, start=0, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=3)

load_from = "ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
# bash tools/dist_train.sh projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408.py 4
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 13.42
# ===> barrier - IoU = 51.07
# ===> bicycle - IoU = 27.68
# ===> bus - IoU = 51.57
# ===> car - IoU = 56.22
# ===> construction_vehicle - IoU = 27.27
# ===> motorcycle - IoU = 29.98
# ===> pedestrian - IoU = 29.93
# ===> traffic_cone - IoU = 29.8
# ===> trailer - IoU = 37.77
# ===> truck - IoU = 43.52
# ===> driveable_surface - IoU = 83.81
# ===> other_flat - IoU = 46.55
# ===> sidewalk - IoU = 56.15
# ===> terrain - IoU = 59.56
# ===> manmade - IoU = 50.84
# ===> vegetation - IoU = 44.67
# ===> mIoU of 6019 samples: 43.52
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 13.31
# ===> barrier - IoU = 51.62
# ===> bicycle - IoU = 28.07
# ===> bus - IoU = 50.91
# ===> car - IoU = 55.69
# ===> construction_vehicle - IoU = 27.46
# ===> motorcycle - IoU = 31.05
# ===> pedestrian - IoU = 29.98
# ===> traffic_cone - IoU = 29.2
# ===> trailer - IoU = 38.86
# ===> truck - IoU = 43.68
# ===> driveable_surface - IoU = 83.87
# ===> other_flat - IoU = 45.63
# ===> sidewalk - IoU = 56.33
# ===> terrain - IoU = 59.01
# ===> manmade - IoU = 50.63
# ===> vegetation - IoU = 44.56
# ===> mIoU of 6019 samples: 43.52
# {'mIoU': array([0.13311691, 0.51617081, 0.28070517, 0.50911942, 0.55694228,
# 0.27461342, 0.31050779, 0.29979125, 0.29204287, 0.38862984,
# 0.43680049, 0.83872518, 0.45630227, 0.56327839, 0.59008883,
# 0.50627122, 0.44564523, 0.90959399])}
\ No newline at end of file
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-pano.py
0 → 100644
View file @
3b8d508a
# Panoptic-FlashOcc R50-depth config with panoptic (centerness) auxiliary head.
# Reconstructed from a token-per-line mangled extraction into valid Python;
# all values are byte-identical to the original token stream.
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
    'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]

numC_Trans = 80

model = dict(
    type='BEVDepthPano',
    # single-frame
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVDepth',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        loss_depth_weight=1,
        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
        downsample=16),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256),
    # Auxiliary center-heatmap head used to lift semantic occupancy to
    # panoptic output (instance centers for the 10 "thing" classes).
    aux_centerness_head=dict(
        type='Centerness_Head',
        task_specific_weight=[1, 1, 0, 0, 0],
        in_channels=256,
        tasks=[
            dict(
                num_class=10,
                class_names=[
                    'car', 'truck', 'construction_vehicle', 'bus',
                    'trailer', 'barrier', 'motorcycle', 'bicycle',
                    'pedestrian', 'traffic_cone'
                ]),
        ],
        common_heads=dict(
            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
        share_conv_channel=64,
        bbox_coder=dict(
            type='CenterPointBBoxCoder',
            pc_range=point_cloud_range[:2],
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            max_num=500,
            score_threshold=0.3,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            code_size=9),
        separate_head=dict(
            type='SeparateHead',
            init_bias=-2.19,
            final_kernel=3),
        loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
        norm_bbox=True),
    occ_head=dict(
        type='BEVOCCHead2D_V2',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=False,
        num_classes=18,
        use_predicter=True,
        class_balance=True,
        loss_occ=dict(
            type='CustomFocalLoss',
            use_sigmoid=True,
            loss_weight=1.0),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            point_cloud_range=point_cloud_range,
            grid_size=[800, 800, 40],
            voxel_size=voxel_size,
            out_size_factor=4,
            dense_reg=1,
            gaussian_overlap=0.1,
            max_objs=500,
            min_radius=2,
            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0,
                          1.0, 1.0, 1.0, 0.2, 0.2])),
    test_cfg=dict(
        pts=dict(
            max_per_img=500,
            max_pool_nms=False,
            min_radius=[4, 12, 10, 1, 0.85, 0.175],
            score_threshold=0.1,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            pre_max_size=1000,
            post_max_size=500,
            # Scale-NMS
            nms_type=['rotate'],
            nms_thr=[0.2],
            nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55,
                                 1.1, 1.0, 1.0, 1.5, 3.5]]
        )
    ),
)

# Data
dataset_type = 'NuScenesDatasetOccpancy'  # sic: registered name in this repo
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(
        type='PrepareImageInputs',
        is_train=True,
        data_config=data_config,
        sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    # Box GT is collected too: the auxiliary centerness head trains on it.
    dict(
        type='Collect3D',
        keys=['img_inputs', 'gt_depth', 'voxel_semantics', 'mask_lidar',
              'mask_camera', 'gt_bboxes_3d', 'gt_labels_3d'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
    dict(
        type='LoadAnnotationsBEVDepth',
        bda_aug_conf=bda_aug_conf,
        classes=class_names,
        is_train=False),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(
                type='Collect3D',
                keys=['points', 'img_inputs',
                      'gt_bboxes_3d', 'gt_labels_3d'])
        ])
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# Settings shared by train/val/test dataset dicts (merged in below).
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet',
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
]

load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 10.21
# ===> barrier - IoU = 42.14
# ===> bicycle - IoU = 22.82
# ===> bus - IoU = 40.13
# ===> car - IoU = 42.86
# ===> construction_vehicle - IoU = 20.69
# ===> motorcycle - IoU = 24.58
# ===> pedestrian - IoU = 23.7
# ===> traffic_cone - IoU = 24.02
# ===> trailer - IoU = 25.48
# ===> truck - IoU = 30.9
# ===> driveable_surface - IoU = 58.65
# ===> other_flat - IoU = 32.04
# ===> sidewalk - IoU = 34.27
# ===> terrain - IoU = 31.12
# ===> manmade - IoU = 18.26
# ===> vegetation - IoU = 17.79
# ===> mIoU of 6019 samples: 29.39
# {'mIoU': array([0.102, 0.421, 0.228, 0.401, 0.429, 0.207, 0.246, 0.237, 0.24 ,
# 0.255, 0.309, 0.586, 0.32 , 0.343, 0.311, 0.183, 0.178, 0.833])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.090 | 0.102 | 0.105 |
# | barrier | 0.387 | 0.442 | 0.465 |
# | bicycle | 0.218 | 0.257 | 0.265 |
# | bus | 0.514 | 0.613 | 0.669 |
# | car | 0.487 | 0.564 | 0.592 |
# | construction_vehicle | 0.176 | 0.254 | 0.288 |
# | motorcycle | 0.203 | 0.292 | 0.310 |
# | pedestrian | 0.301 | 0.349 | 0.366 |
# | traffic_cone | 0.280 | 0.313 | 0.321 |
# | trailer | 0.227 | 0.313 | 0.390 |
# | truck | 0.395 | 0.493 | 0.537 |
# | driveable_surface | 0.534 | 0.618 | 0.708 |
# | other_flat | 0.289 | 0.326 | 0.356 |
# | sidewalk | 0.234 | 0.280 | 0.329 |
# | terrain | 0.222 | 0.291 | 0.356 |
# | manmade | 0.280 | 0.351 | 0.401 |
# | vegetation | 0.176 | 0.273 | 0.359 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.295 | 0.361 | 0.401 |
# +----------------------+----------+----------+----------+
# +----------------------+---------+---------+---------+
# | Class Names | RayPQ@1 | RayPQ@2 | RayPQ@4 |
# +----------------------+---------+---------+---------+
# | others | 0.017 | 0.025 | 0.026 |
# | barrier | 0.125 | 0.182 | 0.218 |
# | bicycle | 0.051 | 0.072 | 0.076 |
# | bus | 0.275 | 0.366 | 0.422 |
# | car | 0.242 | 0.332 | 0.356 |
# | construction_vehicle | 0.016 | 0.058 | 0.092 |
# | motorcycle | 0.071 | 0.124 | 0.137 |
# | pedestrian | 0.017 | 0.022 | 0.023 |
# | traffic_cone | 0.032 | 0.040 | 0.044 |
# | trailer | 0.035 | 0.055 | 0.063 |
# | truck | 0.145 | 0.232 | 0.282 |
# | driveable_surface | 0.410 | 0.537 | 0.665 |
# | other_flat | 0.062 | 0.087 | 0.109 |
# | sidewalk | 0.008 | 0.030 | 0.064 |
# | terrain | 0.010 | 0.026 | 0.047 |
# | manmade | 0.054 | 0.091 | 0.134 |
# | vegetation | 0.003 | 0.022 | 0.092 |
# +----------------------+---------+---------+---------+
# | MEAN | 0.092 | 0.135 | 0.168 |
# +----------------------+---------+---------+---------+
# {'RayIoU': 0.35223182059688496, 'RayIoU@1': 0.29499743138394385, 'RayIoU@2': 0.3607063492639709, 'RayIoU@4': 0.4009916811427401,
# 'RayPQ': 0.13182524545677765, 'RayPQ@1': 0.09247682620339576, 'RayPQ@2': 0.1354024129684159, 'RayPQ@4': 0.16759649719852124}
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano-trt.py
0 → 100644
View file @
3b8d508a
# TensorRT-export variant of the tiny panoptic config: inherit everything from
# the base config and keep only the occupancy branch active at deployment.
_base_ = ['./flashoccv2-r50-depth-tiny-pano.py']

model = {
    'wocc': True,     # keep occupancy output
    'wdet3d': False,  # drop the 3D-detection branch
}
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano.py
0 → 100644
View file @
3b8d508a
# Panoptic-FlashOcc (tiny, pano) config: camera-only occupancy prediction with an
# auxiliary centerness/detection head ('BEVDepthPano') on nuScenes.
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

# Image loading/augmentation config for the six surround-view cameras.
data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
        'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (256, 704),   # network input (H, W)
    'src_size': (900, 1600),    # raw nuScenes image size (H, W)
    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# BEV grid: 0.4 m cells over +/-40 m -> 200x200 BEV; z covers one 6.4 m bin.
# Depth bins: 1-45 m at 1.0 m steps.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 1.0],
}

voxel_size = [0.1, 0.1, 0.2]

# Channel width of the lifted BEV features.
numC_Trans = 64

model = dict(
    type='BEVDepthPano',
    # single-frame
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,  # gradient checkpointing to save memory
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    # LSS-style view transformer with explicit depth supervision.
    img_view_transformer=dict(
        type='LSSViewTransformerBEVDepth',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        loss_depth_weight=1,
        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
        downsample=16),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=128),
    # Auxiliary CenterPoint-style head; only reg/height losses are weighted
    # (task_specific_weight zeroes dim/rot/vel).
    aux_centerness_head=dict(
        type='Centerness_Head',
        task_specific_weight=[1, 1, 0, 0, 0],
        in_channels=128,
        tasks=[
            dict(num_class=10,
                 class_names=['car', 'truck', 'construction_vehicle', 'bus',
                              'trailer', 'barrier', 'motorcycle', 'bicycle',
                              'pedestrian', 'traffic_cone']),
        ],
        common_heads=dict(
            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
        share_conv_channel=64,
        bbox_coder=dict(
            type='CenterPointBBoxCoder',
            pc_range=point_cloud_range[:2],
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            max_num=500,
            score_threshold=0.3,  #
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            code_size=9),
        separate_head=dict(
            type='SeparateHead', init_bias=-2.19, final_kernel=3),
        loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
        norm_bbox=True),
    # 2D (BEV) occupancy head: 18 semantic classes, 16 height (Dz) bins.
    occ_head=dict(
        type='BEVOCCHead2D_V2',
        in_dim=128,
        out_dim=128,
        Dz=16,
        use_mask=False,
        num_classes=18,
        use_predicter=True,
        class_balance=True,
        loss_occ=dict(
            type='CustomFocalLoss',
            use_sigmoid=True,
            loss_weight=1.0),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            point_cloud_range=point_cloud_range,
            grid_size=[800, 800, 40],
            voxel_size=voxel_size,
            out_size_factor=4,
            dense_reg=1,
            gaussian_overlap=0.1,
            max_objs=500,
            min_radius=2,
            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
    test_cfg=dict(
        pts=dict(
            max_per_img=500,
            max_pool_nms=False,
            min_radius=[4, 12, 10, 1, 0.85, 0.175],
            score_threshold=0.1,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            pre_max_size=1000,
            post_max_size=500,
            # Scale-NMS
            nms_type=['rotate'],
            nms_thr=[0.2],
            nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55, 1.1,
                                 1.0, 1.0, 1.5, 3.5]]
        )
    ),
)

# Data
# NOTE(review): 'NuScenesDatasetOccpancy' is the registered name in this
# project's plugin (spelling included) — do not "fix" the string.
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# BEV-space augmentation: no rotation/scaling, only random x/y flips.
bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(type='PrepareImageInputs', is_train=True, data_config=data_config,
         sequential=False),
    dict(type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf,
         classes=class_names, is_train=True),
    dict(type='LoadOccGTFromFile'),
    # LiDAR points are loaded only to build the depth-supervision target.
    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5,
         file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
                                 'mask_lidar', 'mask_camera', 'gt_bboxes_3d',
                                 'gt_labels_3d'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config,
         sequential=False),
    dict(type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf,
         classes=class_names, is_train=False),
    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5,
         file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='DefaultFormatBundle3D', class_names=class_names,
                 with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs',
                                         'gt_bboxes_3d', 'gt_labels_3d'])
        ])
]

# Camera-only setup: every other sensor modality is disabled.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# Keys merged into train/val/test dataset dicts below.
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet',
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

# Apply the shared dataset settings to all three splits.
for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
]

# Warm-start from a BEVDet depth-supervised checkpoint.
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
# Evaluate every epoch, but only from epoch 20 onward.
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 10.33
# ===> barrier - IoU = 41.02
# ===> bicycle - IoU = 22.16
# ===> bus - IoU = 39.75
# ===> car - IoU = 42.63
# ===> construction_vehicle - IoU = 20.53
# ===> motorcycle - IoU = 24.01
# ===> pedestrian - IoU = 23.71
# ===> traffic_cone - IoU = 24.65
# ===> trailer - IoU = 25.58
# ===> truck - IoU = 30.63
# ===> driveable_surface - IoU = 58.0
# ===> other_flat - IoU = 32.12
# ===> sidewalk - IoU = 33.78
# ===> terrain - IoU = 31.02
# ===> manmade - IoU = 17.67
# ===> vegetation - IoU = 17.74
# ===> mIoU of 6019 samples: 29.14
# {'mIoU': array([0.103, 0.41 , 0.222, 0.397, 0.426, 0.205, 0.24 , 0.237, 0.246,
# 0.256, 0.306, 0.58 , 0.321, 0.338, 0.31 , 0.177, 0.177, 0.832])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.095 | 0.107 | 0.110 |
# | barrier | 0.374 | 0.429 | 0.452 |
# | bicycle | 0.208 | 0.242 | 0.248 |
# | bus | 0.498 | 0.603 | 0.659 |
# | car | 0.489 | 0.568 | 0.598 |
# | construction_vehicle | 0.171 | 0.247 | 0.279 |
# | motorcycle | 0.190 | 0.277 | 0.298 |
# | pedestrian | 0.295 | 0.344 | 0.361 |
# | traffic_cone | 0.290 | 0.324 | 0.332 |
# | trailer | 0.207 | 0.292 | 0.368 |
# | truck | 0.411 | 0.507 | 0.551 |
# | driveable_surface | 0.531 | 0.614 | 0.704 |
# | other_flat | 0.286 | 0.325 | 0.357 |
# | sidewalk | 0.234 | 0.280 | 0.328 |
# | terrain | 0.220 | 0.290 | 0.356 |
# | manmade | 0.267 | 0.343 | 0.392 |
# | vegetation | 0.174 | 0.272 | 0.358 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.291 | 0.357 | 0.397 |
# +----------------------+----------+----------+----------+
# 6019it [09:34, 10.48it/s]
# +----------------------+---------+---------+---------+
# | Class Names | RayPQ@1 | RayPQ@2 | RayPQ@4 |
# +----------------------+---------+---------+---------+
# | others | 0.017 | 0.024 | 0.025 |
# | barrier | 0.107 | 0.169 | 0.204 |
# | bicycle | 0.069 | 0.086 | 0.088 |
# | bus | 0.244 | 0.350 | 0.408 |
# | car | 0.238 | 0.326 | 0.352 |
# | construction_vehicle | 0.018 | 0.081 | 0.105 |
# | motorcycle | 0.061 | 0.105 | 0.117 |
# | pedestrian | 0.016 | 0.022 | 0.023 |
# | traffic_cone | 0.030 | 0.049 | 0.052 |
# | trailer | 0.029 | 0.047 | 0.056 |
# | truck | 0.151 | 0.240 | 0.286 |
# | driveable_surface | 0.407 | 0.531 | 0.662 |
# | other_flat | 0.054 | 0.078 | 0.098 |
# | sidewalk | 0.009 | 0.030 | 0.061 |
# | terrain | 0.006 | 0.022 | 0.045 |
# | manmade | 0.044 | 0.091 | 0.128 |
# | vegetation | 0.001 | 0.021 | 0.091 |
# +----------------------+---------+---------+---------+
# | MEAN | 0.088 | 0.134 | 0.165 |
# +----------------------+---------+---------+---------+
# {'RayIoU': 0.34819957391233375, 'RayIoU@1': 0.29065973127346445, 'RayIoU@2': 0.3566749015912661, 'RayIoU@4': 0.39726408887227066,
# 'RayPQ': 0.12890890185841564, 'RayPQ@1': 0.08832135839934552, 'RayPQ@2': 0.1336058084882046, 'RayPQ@4': 0.1647995386876968}
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny.py
0 → 100644
View file @
3b8d508a
# Panoptic-FlashOcc (tiny) config: camera-only occupancy prediction
# ('BEVDepthOCC', no detection head) on nuScenes.
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

# Image loading/augmentation config for the six surround-view cameras.
data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
        'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (256, 704),   # network input (H, W)
    'src_size': (900, 1600),    # raw nuScenes image size (H, W)
    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# BEV grid: 0.4 m cells over +/-40 m -> 200x200 BEV; z covers one 6.4 m bin.
# Depth bins: 1-45 m at 1.0 m steps.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 1.0],
}

voxel_size = [0.1, 0.1, 0.2]

# Channel width of the lifted BEV features.
numC_Trans = 64

model = dict(
    type='BEVDepthOCC',
    # single-frame
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,  # gradient checkpointing to save memory
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    # LSS-style view transformer with explicit depth supervision.
    img_view_transformer=dict(
        type='LSSViewTransformerBEVDepth',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        loss_depth_weight=1,
        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
        downsample=16),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=128),
    # 2D (BEV) occupancy head: 18 semantic classes, 16 height (Dz) bins.
    occ_head=dict(
        type='BEVOCCHead2D_V2',
        in_dim=128,
        out_dim=128,
        Dz=16,
        use_mask=False,
        num_classes=18,
        use_predicter=True,
        class_balance=True,
        loss_occ=dict(
            type='CustomFocalLoss',
            use_sigmoid=True,
            loss_weight=1.0),
    )
)

# Data
# NOTE(review): 'NuScenesDatasetOccpancy' is the registered name in this
# project's plugin (spelling included) — do not "fix" the string.
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# BEV-space augmentation: no rotation/scaling, only random x/y flips.
bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(type='PrepareImageInputs', is_train=True, data_config=data_config,
         sequential=False),
    dict(type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf,
         classes=class_names, is_train=True),
    dict(type='LoadOccGTFromFile'),
    # LiDAR points are loaded only to build the depth-supervision target.
    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5,
         file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
                                 'mask_lidar', 'mask_camera'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config,
         sequential=False),
    dict(type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf,
         classes=class_names, is_train=False),
    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5,
         file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='DefaultFormatBundle3D', class_names=class_names,
                 with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs'])
        ])
]

# Camera-only setup: every other sensor modality is disabled.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# Keys merged into train/val/test dataset dicts below.
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet',
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

# Apply the shared dataset settings to all three splits.
for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
]

# Warm-start from a BEVDet depth-supervised checkpoint.
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
# Evaluate every epoch, but only from epoch 20 onward.
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# use_mask = False
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 10.69
# ===> barrier - IoU = 39.67
# ===> bicycle - IoU = 22.01
# ===> bus - IoU = 39.99
# ===> car - IoU = 40.46
# ===> construction_vehicle - IoU = 20.44
# ===> motorcycle - IoU = 24.52
# ===> pedestrian - IoU = 22.5
# ===> traffic_cone - IoU = 23.72
# ===> trailer - IoU = 25.93
# ===> truck - IoU = 29.75
# ===> driveable_surface - IoU = 58.29
# ===> other_flat - IoU = 31.46
# ===> sidewalk - IoU = 33.92
# ===> terrain - IoU = 31.25
# ===> manmade - IoU = 17.46
# ===> vegetation - IoU = 17.97
# ===> mIoU of 6019 samples: 28.83
# {'mIoU': array([0.1068576 , 0.3967071 , 0.220114 , 0.3998965 , 0.40462457,
# 0.20442682, 0.24516316, 0.22497209, 0.23719173, 0.25925541,
# 0.29754347, 0.58293305, 0.31458314, 0.33921965, 0.31254221,
# 0.17456574, 0.17970859, 0.8315865 ])}
# Starting Evaluation...
# 6019it [10:23, 9.65it/s]
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.094 | 0.107 | 0.111 |
# | barrier | 0.367 | 0.421 | 0.443 |
# | bicycle | 0.209 | 0.251 | 0.261 |
# | bus | 0.498 | 0.601 | 0.665 |
# | car | 0.472 | 0.550 | 0.581 |
# | construction_vehicle | 0.175 | 0.251 | 0.287 |
# | motorcycle | 0.205 | 0.292 | 0.315 |
# | pedestrian | 0.289 | 0.339 | 0.354 |
# | traffic_cone | 0.276 | 0.302 | 0.314 |
# | trailer | 0.203 | 0.289 | 0.380 |
# | truck | 0.396 | 0.493 | 0.546 |
# | driveable_surface | 0.528 | 0.611 | 0.702 |
# | other_flat | 0.280 | 0.315 | 0.346 |
# | sidewalk | 0.233 | 0.279 | 0.328 |
# | terrain | 0.218 | 0.286 | 0.353 |
# | manmade | 0.268 | 0.347 | 0.398 |
# | vegetation | 0.174 | 0.272 | 0.358 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.287 | 0.353 | 0.397 |
# +----------------------+----------+----------+----------+
# {'RayIoU': 0.34574739050176573, 'RayIoU@1': 0.2873820616941079, 'RayIoU@2': 0.3533573712072785,
# 'RayIoU@4': 0.39650273860391083}
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-trt.py
0 → 100644
View file @
3b8d508a
# TensorRT-export variant: inherit everything from the base depth config and
# keep only the occupancy branch active at deployment.
_base_ = ['./flashoccv2-r50-depth.py']

model = {
    'wocc': True,     # keep occupancy output
    'wdet3d': False,  # drop the 3D-detection branch
}
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth.py
0 → 100644
View file @
3b8d508a
# Panoptic-FlashOcc (base) config: camera-only occupancy prediction with an
# auxiliary centerness/detection head ('BEVDepthPano') on nuScenes.
# Wider than the tiny variant: numC_Trans=80, 256-channel BEV neck/heads,
# 0.5 m depth bins.
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

# Image loading/augmentation config for the six surround-view cameras.
data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
        'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (256, 704),   # network input (H, W)
    'src_size': (900, 1600),    # raw nuScenes image size (H, W)
    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# BEV grid: 0.4 m cells over +/-40 m -> 200x200 BEV; z covers one 6.4 m bin.
# Depth bins: 1-45 m at 0.5 m steps (finer than the tiny variant).
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]

# Channel width of the lifted BEV features.
numC_Trans = 80

model = dict(
    type='BEVDepthPano',
    # single-frame
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,  # gradient checkpointing to save memory
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    # LSS-style view transformer with explicit depth supervision.
    img_view_transformer=dict(
        type='LSSViewTransformerBEVDepth',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=256,
        out_channels=numC_Trans,
        loss_depth_weight=1,
        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
        downsample=16),
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256),
    # Auxiliary CenterPoint-style head; only reg/height losses are weighted
    # (task_specific_weight zeroes dim/rot/vel).
    aux_centerness_head=dict(
        type='Centerness_Head',
        task_specific_weight=[1, 1, 0, 0, 0],
        in_channels=256,
        tasks=[
            dict(num_class=10,
                 class_names=['car', 'truck', 'construction_vehicle', 'bus',
                              'trailer', 'barrier', 'motorcycle', 'bicycle',
                              'pedestrian', 'traffic_cone']),
        ],
        common_heads=dict(
            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
        share_conv_channel=64,
        bbox_coder=dict(
            type='CenterPointBBoxCoder',
            pc_range=point_cloud_range[:2],
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            max_num=500,
            score_threshold=0.3,  #
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            code_size=9),
        separate_head=dict(
            type='SeparateHead', init_bias=-2.19, final_kernel=3),
        loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
        norm_bbox=True),
    # 2D (BEV) occupancy head: 18 semantic classes, 16 height (Dz) bins.
    occ_head=dict(
        type='BEVOCCHead2D_V2',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=False,
        num_classes=18,
        use_predicter=True,
        class_balance=True,
        loss_occ=dict(
            type='CustomFocalLoss',
            use_sigmoid=True,
            loss_weight=1.0),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            point_cloud_range=point_cloud_range,
            grid_size=[800, 800, 40],
            voxel_size=voxel_size,
            out_size_factor=4,
            dense_reg=1,
            gaussian_overlap=0.1,
            max_objs=500,
            min_radius=2,
            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
    test_cfg=dict(
        pts=dict(
            max_per_img=500,
            max_pool_nms=False,
            min_radius=[4, 12, 10, 1, 0.85, 0.175],
            score_threshold=0.1,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            pre_max_size=1000,
            post_max_size=500,
            # Scale-NMS
            nms_type=['rotate'],
            nms_thr=[0.2],
            nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55, 1.1,
                                 1.0, 1.0, 1.5, 3.5]]
        )
    ),
)

# Data
# NOTE(review): 'NuScenesDatasetOccpancy' is the registered name in this
# project's plugin (spelling included) — do not "fix" the string.
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# BEV-space augmentation: no rotation/scaling, only random x/y flips.
bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(type='PrepareImageInputs', is_train=True, data_config=data_config,
         sequential=False),
    dict(type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf,
         classes=class_names, is_train=True),
    dict(type='LoadOccGTFromFile'),
    # LiDAR points are loaded only to build the depth-supervision target.
    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5,
         file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
                                 'mask_lidar', 'mask_camera'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config,
         sequential=False),
    dict(type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf,
         classes=class_names, is_train=False),
    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5,
         file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='DefaultFormatBundle3D', class_names=class_names,
                 with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs'])
        ])
]

# Camera-only setup: every other sensor modality is disabled.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# Keys merged into train/val/test dataset dicts below.
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet',
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

# Apply the shared dataset settings to all three splits.
for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(
        type='MEGVIIEMAHook',
        init_updates=10560,
        priority='NORMAL',
    ),
]

# Warm-start from a BEVDet depth-supervised checkpoint.
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
# Evaluate every epoch, but only from epoch 20 onward.
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# use_mask = False
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.090 | 0.102 | 0.105 |
# | barrier | 0.387 | 0.442 | 0.465 |
# | bicycle | 0.218 | 0.257 | 0.265 |
# | bus | 0.514 | 0.613 | 0.669 |
# | car | 0.487 | 0.564 | 0.592 |
# | construction_vehicle | 0.176 | 0.254 | 0.288 |
# | motorcycle | 0.203 | 0.292 | 0.310 |
# | pedestrian | 0.301 | 0.349 | 0.366 |
# | traffic_cone | 0.280 | 0.313 | 0.321 |
# | trailer | 0.227 | 0.313 | 0.390 |
# | truck | 0.395 | 0.493 | 0.537 |
# | driveable_surface | 0.534 | 0.618 | 0.708 |
# | other_flat | 0.289 | 0.326 | 0.356 |
# | sidewalk | 0.234 | 0.280 | 0.329 |
# | terrain | 0.222 | 0.291 | 0.356 |
# | manmade | 0.280 | 0.351 | 0.401 |
# | vegetation | 0.176 | 0.273 | 0.359 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.295 | 0.361 | 0.401 |
# +----------------------+----------+----------+----------+
# +----------------------+---------+---------+---------+
# | Class Names | RayPQ@1 | RayPQ@2 | RayPQ@4 |
# +----------------------+---------+---------+---------+
# | others | 0.017 | 0.025 | 0.026 |
# | barrier | 0.125 | 0.182 | 0.218 |
# | bicycle | 0.051 | 0.072 | 0.076 |
# | bus | 0.275 | 0.366 | 0.422 |
# | car | 0.242 | 0.332 | 0.356 |
# | construction_vehicle | 0.016 | 0.058 | 0.092 |
# | motorcycle | 0.071 | 0.124 | 0.137 |
# | pedestrian | 0.017 | 0.022 | 0.023 |
# | traffic_cone | 0.032 | 0.040 | 0.044 |
# | trailer | 0.035 | 0.055 | 0.063 |
# | truck | 0.145 | 0.232 | 0.282 |
# | driveable_surface | 0.410 | 0.537 | 0.665 |
# | other_flat | 0.062 | 0.087 | 0.109 |
# | sidewalk | 0.008 | 0.030 | 0.064 |
# | terrain | 0.010 | 0.026 | 0.047 |
# | manmade | 0.054 | 0.091 | 0.134 |
# | vegetation | 0.003 | 0.022 | 0.092 |
# +----------------------+---------+---------+---------+
# | MEAN | 0.092 | 0.135 | 0.168 |
# +----------------------+---------+---------+---------+
# {'RayIoU': 0.35223182059688496, 'RayIoU@1': 0.29499743138394385, 'RayIoU@2': 0.3607063492639709, 'RayIoU@4': 0.4009916811427401, 'RayPQ': 0.13182524545677765, 'RayPQ@1': 0.09247682620339576, 'RayPQ@2': 0.1354024129684159, 'RayPQ@4': 0.16759649719852124}
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth4d-longterm16f-pano.py
0 → 100644
View file @
3b8d508a
# Panoptic-FlashOcc config: ResNet-50 backbone, depth supervision, 4D temporal
# fusion over 16 adjacent frames, panoptic setup (occupancy head + auxiliary
# centerness head for instance centers).
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# BEV grid: 0.4 m cells over [-40, 40] m in x/y; single 6.4 m bin in z;
# depth bins of 0.5 m between 1 m and 45 m.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80

# (start, stop, step) of the adjacent-frame ids fused with the current frame:
# frames 1..16 -> 16 history frames.
multi_adj_frame_id_cfg = (1, 16 + 1, 1)

model = dict(
    type='BEVDepth4DPano',
    num_adj=multi_adj_frame_id_cfg[1] - 1,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=512,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVDepth',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=512,
        out_channels=numC_Trans,
        loss_depth_weight=1,
        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
        downsample=16),
    pre_process=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_layer=[1, ],
        num_channels=[numC_Trans, ],
        stride=[1, ],
        backbone_output_ids=[0, ]),
    # Input channels = current frame + all fused history frames.
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1),
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256),
    aux_centerness_head=dict(
        type='Centerness_Head',
        task_specific_weight=[1, 1, 0, 0, 0],
        in_channels=256,
        tasks=[
            dict(num_class=10,
                 class_names=[
                     'car', 'truck', 'construction_vehicle', 'bus',
                     'trailer', 'barrier', 'motorcycle', 'bicycle',
                     'pedestrian', 'traffic_cone'
                 ]),
        ],
        common_heads=dict(
            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
        share_conv_channel=64,
        bbox_coder=dict(
            type='CenterPointBBoxCoder',
            pc_range=point_cloud_range[:2],
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            max_num=500,
            score_threshold=0.1,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            code_size=9),
        separate_head=dict(
            type='SeparateHead', init_bias=-2.19, final_kernel=3),
        loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
        norm_bbox=True),
    occ_head=dict(
        type='BEVOCCHead2D_V2',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=False,
        num_classes=18,
        use_predicter=True,
        class_balance=True,
        loss_occ=dict(
            type='CustomFocalLoss', use_sigmoid=True, loss_weight=1.0),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            point_cloud_range=point_cloud_range,
            grid_size=[800, 800, 40],
            voxel_size=voxel_size,
            out_size_factor=4,
            dense_reg=1,
            gaussian_overlap=0.1,
            max_objs=500,
            min_radius=2,
            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                          0.2, 0.2])),
    test_cfg=dict(
        pts=dict(
            max_per_img=500,
            max_pool_nms=False,
            min_radius=[4, 12, 10, 1, 0.85, 0.175],
            score_threshold=0.1,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            pre_max_size=1000,
            post_max_size=500,
            # Scale-NMS
            nms_type=['rotate'],
            nms_thr=[0.2],
            nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55, 1.1, 1.0, 1.0,
                                 1.5, 3.5]])),
)

# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# BEV-space augmentation: rotation/scale disabled, random x/y flips only.
bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(type='PrepareImageInputs',
         is_train=True,
         data_config=data_config,
         sequential=True),
    dict(type='LoadAnnotationsBEVDepth',
         bda_aug_conf=bda_aug_conf,
         classes=class_names,
         is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(type='LoadPointsFromFile',
         coord_type='LIDAR',
         load_dim=5,
         use_dim=5,
         file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    # Boxes/labels are collected here for the auxiliary centerness head.
    dict(type='Collect3D',
         keys=['img_inputs', 'gt_depth', 'voxel_semantics', 'mask_lidar',
               'mask_camera', 'gt_bboxes_3d', 'gt_labels_3d'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
    dict(type='LoadAnnotationsBEVDepth',
         bda_aug_conf=bda_aug_conf,
         classes=class_names,
         is_train=False),
    dict(type='LoadPointsFromFile',
         coord_type='LIDAR',
         load_dim=5,
         use_dim=5,
         file_client_args=file_client_args),
    dict(type='MultiScaleFlipAug3D',
         img_scale=(1333, 800),
         pts_scale_ratio=1,
         flip=False,
         transforms=[
             dict(type='DefaultFormatBundle3D',
                  class_names=class_names,
                  with_label=False),
             dict(type='Collect3D',
                  keys=['points', 'img_inputs', 'gt_bboxes_3d',
                        'gt_labels_3d'])
         ])
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# Settings shared by train/val/test splits (applied via the loop below).
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet4d',
    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL'),
]

load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth4d-longterm16f.py
0 → 100644
View file @
3b8d508a
# FlashOcc config: ResNet-50 backbone, depth supervision, 4D temporal fusion
# over 16 adjacent frames, occupancy head only (no auxiliary detection head).
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# BEV grid: 0.4 m cells over [-40, 40] m in x/y; single 6.4 m bin in z.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80

# (start, stop, step) of the adjacent-frame ids fused with the current frame.
multi_adj_frame_id_cfg = (1, 16 + 1, 1)

model = dict(
    type='BEVDepth4DOCC',
    num_adj=multi_adj_frame_id_cfg[1] - 1,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=512,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVDepth',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=512,
        out_channels=numC_Trans,
        loss_depth_weight=1,
        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
        downsample=16),
    pre_process=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_layer=[1, ],
        num_channels=[numC_Trans, ],
        stride=[1, ],
        backbone_output_ids=[0, ]),
    # Input channels = current frame + all fused history frames.
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1),
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256),
    occ_head=dict(
        type='BEVOCCHead2D_V2',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=False,
        num_classes=18,
        use_predicter=True,
        class_balance=True,
        loss_occ=dict(
            type='CustomFocalLoss', use_sigmoid=True, loss_weight=1.0),
    )
)

# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# BEV-space augmentation: rotation/scale disabled, random x/y flips only.
bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(type='PrepareImageInputs',
         is_train=True,
         data_config=data_config,
         sequential=True),
    dict(type='LoadAnnotationsBEVDepth',
         bda_aug_conf=bda_aug_conf,
         classes=class_names,
         is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(type='LoadPointsFromFile',
         coord_type='LIDAR',
         load_dim=5,
         use_dim=5,
         file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D',
         keys=['img_inputs', 'gt_depth', 'voxel_semantics', 'mask_lidar',
               'mask_camera'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
    dict(type='LoadAnnotationsBEVDepth',
         bda_aug_conf=bda_aug_conf,
         classes=class_names,
         is_train=False),
    dict(type='LoadPointsFromFile',
         coord_type='LIDAR',
         load_dim=5,
         use_dim=5,
         file_client_args=file_client_args),
    dict(type='MultiScaleFlipAug3D',
         img_scale=(1333, 800),
         pts_scale_ratio=1,
         flip=False,
         transforms=[
             dict(type='DefaultFormatBundle3D',
                  class_names=class_names,
                  with_label=False),
             dict(type='Collect3D', keys=['points', 'img_inputs'])
         ])
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# Settings shared by train/val/test splits (applied via the loop below).
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet4d',
    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL'),
]

load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# use_mask = False
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 11.94
# ===> barrier - IoU = 44.84
# ===> bicycle - IoU = 26.66
# ===> bus - IoU = 41.53
# ===> car - IoU = 44.42
# ===> construction_vehicle - IoU = 20.79
# ===> motorcycle - IoU = 26.96
# ===> pedestrian - IoU = 25.98
# ===> traffic_cone - IoU = 29.25
# ===> trailer - IoU = 24.24
# ===> truck - IoU = 32.28
# ===> driveable_surface - IoU = 60.5
# ===> other_flat - IoU = 33.07
# ===> sidewalk - IoU = 37.01
# ===> terrain - IoU = 33.54
# ===> manmade - IoU = 21.75
# ===> vegetation - IoU = 21.58
# ===> mIoU of 6019 samples: 31.55
# {'mIoU': array([0.119, 0.448, 0.267, 0.415, 0.444, 0.208, 0.27 , 0.26 , 0.293,
# 0.242, 0.323, 0.605, 0.331, 0.37 , 0.335, 0.217, 0.216, 0.839])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.110 | 0.118 | 0.119 |
# | barrier | 0.444 | 0.484 | 0.499 |
# | bicycle | 0.278 | 0.311 | 0.319 |
# | bus | 0.537 | 0.635 | 0.691 |
# | car | 0.512 | 0.585 | 0.611 |
# | construction_vehicle | 0.153 | 0.218 | 0.238 |
# | motorcycle | 0.228 | 0.310 | 0.330 |
# | pedestrian | 0.338 | 0.387 | 0.401 |
# | traffic_cone | 0.342 | 0.362 | 0.370 |
# | trailer | 0.209 | 0.293 | 0.368 |
# | truck | 0.422 | 0.511 | 0.555 |
# | driveable_surface | 0.570 | 0.653 | 0.742 |
# | other_flat | 0.301 | 0.340 | 0.375 |
# | sidewalk | 0.266 | 0.319 | 0.370 |
# | terrain | 0.261 | 0.334 | 0.400 |
# | manmade | 0.360 | 0.435 | 0.485 |
# | vegetation | 0.244 | 0.354 | 0.442 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.328 | 0.391 | 0.430 |
# +----------------------+----------+----------+----------+
# {'RayIoU': 0.38313147213727416, 'RayIoU@1': 0.3279517851047602, 'RayIoU@2': 0.3911038935232673, 'RayIoU@4': 0.4303387377837949}
\ No newline at end of file
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth4d-longterm8f-pano.py
0 → 100644
View file @
3b8d508a
# Panoptic-FlashOcc config: ResNet-50 backbone, depth supervision, 4D temporal
# fusion over 8 adjacent frames, panoptic setup (occupancy head + auxiliary
# centerness head for instance centers).
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# BEV grid: 0.4 m cells over [-40, 40] m in x/y; single 6.4 m bin in z.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80

# (start, stop, step) of the adjacent-frame ids fused with the current frame:
# frames 1..8 -> 8 history frames.
multi_adj_frame_id_cfg = (1, 8 + 1, 1)

model = dict(
    type='BEVDepth4DPano',
    num_adj=multi_adj_frame_id_cfg[1] - 1,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=512,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVDepth',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=512,
        out_channels=numC_Trans,
        loss_depth_weight=1,
        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
        downsample=16),
    pre_process=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_layer=[1, ],
        num_channels=[numC_Trans, ],
        stride=[1, ],
        backbone_output_ids=[0, ]),
    # Input channels = current frame + all fused history frames.
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1),
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256),
    aux_centerness_head=dict(
        type='Centerness_Head',
        task_specific_weight=[1, 1, 0, 0, 0],
        in_channels=256,
        tasks=[
            dict(num_class=10,
                 class_names=[
                     'car', 'truck', 'construction_vehicle', 'bus',
                     'trailer', 'barrier', 'motorcycle', 'bicycle',
                     'pedestrian', 'traffic_cone'
                 ]),
        ],
        common_heads=dict(
            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
        share_conv_channel=64,
        bbox_coder=dict(
            type='CenterPointBBoxCoder',
            pc_range=point_cloud_range[:2],
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            max_num=500,
            # NOTE(review): 0.3 here vs 0.1 in the 16-frame variant of this
            # config — presumably intentional tuning; confirm against paper.
            score_threshold=0.3,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            code_size=9),
        separate_head=dict(
            type='SeparateHead', init_bias=-2.19, final_kernel=3),
        loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
        norm_bbox=True),
    occ_head=dict(
        type='BEVOCCHead2D_V2',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=False,
        num_classes=18,
        use_predicter=True,
        class_balance=True,
        loss_occ=dict(
            type='CustomFocalLoss', use_sigmoid=True, loss_weight=1.0),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            point_cloud_range=point_cloud_range,
            grid_size=[800, 800, 40],
            voxel_size=voxel_size,
            out_size_factor=4,
            dense_reg=1,
            gaussian_overlap=0.1,
            max_objs=500,
            min_radius=2,
            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                          0.2, 0.2])),
    test_cfg=dict(
        pts=dict(
            max_per_img=500,
            max_pool_nms=False,
            min_radius=[4, 12, 10, 1, 0.85, 0.175],
            score_threshold=0.1,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            pre_max_size=1000,
            post_max_size=500,
            # Scale-NMS
            nms_type=['rotate'],
            nms_thr=[0.2],
            nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55, 1.1, 1.0, 1.0,
                                 1.5, 3.5]])),
)

# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# BEV-space augmentation: rotation/scale disabled, random x/y flips only.
bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(type='PrepareImageInputs',
         is_train=True,
         data_config=data_config,
         sequential=True),
    dict(type='LoadAnnotationsBEVDepth',
         bda_aug_conf=bda_aug_conf,
         classes=class_names,
         is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(type='LoadPointsFromFile',
         coord_type='LIDAR',
         load_dim=5,
         use_dim=5,
         file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    # Boxes/labels are collected here for the auxiliary centerness head.
    dict(type='Collect3D',
         keys=['img_inputs', 'gt_depth', 'voxel_semantics', 'mask_lidar',
               'mask_camera', 'gt_bboxes_3d', 'gt_labels_3d'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
    dict(type='LoadAnnotationsBEVDepth',
         bda_aug_conf=bda_aug_conf,
         classes=class_names,
         is_train=False),
    dict(type='LoadPointsFromFile',
         coord_type='LIDAR',
         load_dim=5,
         use_dim=5,
         file_client_args=file_client_args),
    dict(type='MultiScaleFlipAug3D',
         img_scale=(1333, 800),
         pts_scale_ratio=1,
         flip=False,
         transforms=[
             dict(type='DefaultFormatBundle3D',
                  class_names=class_names,
                  with_label=False),
             dict(type='Collect3D',
                  keys=['points', 'img_inputs', 'gt_bboxes_3d',
                        'gt_labels_3d'])
         ])
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# Settings shared by train/val/test splits (applied via the loop below).
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet4d',
    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL'),
]

load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# use_mask = False
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 11.51
# ===> barrier - IoU = 45.87
# ===> bicycle - IoU = 24.65
# ===> bus - IoU = 41.75
# ===> car - IoU = 46.15
# ===> construction_vehicle - IoU = 20.96
# ===> motorcycle - IoU = 26.82
# ===> pedestrian - IoU = 26.77
# ===> traffic_cone - IoU = 29.66
# ===> trailer - IoU = 24.65
# ===> truck - IoU = 32.75
# ===> driveable_surface - IoU = 60.39
# ===> other_flat - IoU = 32.87
# ===> sidewalk - IoU = 36.49
# ===> terrain - IoU = 33.16
# ===> manmade - IoU = 21.3
# ===> vegetation - IoU = 20.92
# ===> mIoU of 6019 samples: 31.57
# {'mIoU': array([0.115, 0.459, 0.247, 0.418, 0.461, 0.21 , 0.268, 0.268, 0.297,
# 0.247, 0.328, 0.604, 0.329, 0.365, 0.332, 0.213, 0.209, 0.839])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.101 | 0.108 | 0.110 |
# | barrier | 0.439 | 0.480 | 0.497 |
# | bicycle | 0.258 | 0.286 | 0.293 |
# | bus | 0.540 | 0.649 | 0.700 |
# | car | 0.531 | 0.603 | 0.629 |
# | construction_vehicle | 0.180 | 0.252 | 0.282 |
# | motorcycle | 0.247 | 0.328 | 0.343 |
# | pedestrian | 0.347 | 0.393 | 0.409 |
# | traffic_cone | 0.346 | 0.371 | 0.378 |
# | trailer | 0.209 | 0.292 | 0.384 |
# | truck | 0.452 | 0.544 | 0.587 |
# | driveable_surface | 0.562 | 0.646 | 0.734 |
# | other_flat | 0.290 | 0.328 | 0.363 |
# | sidewalk | 0.261 | 0.313 | 0.363 |
# | terrain | 0.260 | 0.330 | 0.394 |
# | manmade | 0.345 | 0.421 | 0.471 |
# | vegetation | 0.229 | 0.337 | 0.423 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.329 | 0.393 | 0.433 |
# +----------------------+----------+----------+----------+
# 6019it [10:36, 9.46it/s]
# +----------------------+---------+---------+---------+
# | Class Names | RayPQ@1 | RayPQ@2 | RayPQ@4 |
# +----------------------+---------+---------+---------+
# | others | 0.026 | 0.032 | 0.033 |
# | barrier | 0.184 | 0.232 | 0.253 |
# | bicycle | 0.088 | 0.103 | 0.108 |
# | bus | 0.311 | 0.406 | 0.458 |
# | car | 0.300 | 0.380 | 0.403 |
# | construction_vehicle | 0.032 | 0.057 | 0.081 |
# | motorcycle | 0.114 | 0.156 | 0.169 |
# | pedestrian | 0.025 | 0.030 | 0.031 |
# | traffic_cone | 0.071 | 0.081 | 0.085 |
# | trailer | 0.049 | 0.077 | 0.088 |
# | truck | 0.182 | 0.274 | 0.314 |
# | driveable_surface | 0.457 | 0.574 | 0.702 |
# | other_flat | 0.062 | 0.086 | 0.106 |
# | sidewalk | 0.018 | 0.042 | 0.091 |
# | terrain | 0.017 | 0.039 | 0.074 |
# | manmade | 0.077 | 0.144 | 0.194 |
# | vegetation | 0.002 | 0.061 | 0.162 |
# +----------------------+---------+---------+---------+
# | MEAN | 0.119 | 0.163 | 0.197 |
# +----------------------+---------+---------+---------+
# {'RayIoU': 0.3850202377154096, 'RayIoU@1': 0.3291477679560127, 'RayIoU@2': 0.39307010079658805, 'RayIoU@4': 0.4328428443936281,
# 'RayPQ': 0.15961266397677248, 'RayPQ@1': 0.11850092407498894, 'RayPQ@2': 0.1631862461686837, 'RayPQ@4': 0.19715082168664483}
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth4d-longterm8f.py
0 → 100644
View file @
3b8d508a
# FlashOcc config: ResNet-50 backbone, depth supervision, 4D temporal fusion
# over 8 adjacent frames, occupancy head only (no auxiliary detection head).
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# BEV grid: 0.4 m cells over [-40, 40] m in x/y; single 6.4 m bin in z.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80

# (start, stop, step) of the adjacent-frame ids fused with the current frame.
multi_adj_frame_id_cfg = (1, 8 + 1, 1)

model = dict(
    type='BEVDepth4DOCC',
    num_adj=multi_adj_frame_id_cfg[1] - 1,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=512,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVDepth',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=512,
        out_channels=numC_Trans,
        loss_depth_weight=1,
        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
        downsample=16),
    pre_process=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_layer=[1, ],
        num_channels=[numC_Trans, ],
        stride=[1, ],
        backbone_output_ids=[0, ]),
    # Input channels = current frame + all fused history frames.
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1),
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256),
    occ_head=dict(
        type='BEVOCCHead2D_V2',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=False,
        num_classes=18,
        use_predicter=True,
        class_balance=True,
        loss_occ=dict(
            type='CustomFocalLoss', use_sigmoid=True, loss_weight=1.0),
    )
)

# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# BEV-space augmentation: rotation/scale disabled, random x/y flips only.
bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(type='PrepareImageInputs',
         is_train=True,
         data_config=data_config,
         sequential=True),
    dict(type='LoadAnnotationsBEVDepth',
         bda_aug_conf=bda_aug_conf,
         classes=class_names,
         is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(type='LoadPointsFromFile',
         coord_type='LIDAR',
         load_dim=5,
         use_dim=5,
         file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D',
         keys=['img_inputs', 'gt_depth', 'voxel_semantics', 'mask_lidar',
               'mask_camera'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
    dict(type='LoadAnnotationsBEVDepth',
         bda_aug_conf=bda_aug_conf,
         classes=class_names,
         is_train=False),
    dict(type='LoadPointsFromFile',
         coord_type='LIDAR',
         load_dim=5,
         use_dim=5,
         file_client_args=file_client_args),
    dict(type='MultiScaleFlipAug3D',
         img_scale=(1333, 800),
         pts_scale_ratio=1,
         flip=False,
         transforms=[
             dict(type='DefaultFormatBundle3D',
                  class_names=class_names,
                  with_label=False),
             dict(type='Collect3D', keys=['points', 'img_inputs'])
         ])
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# Settings shared by train/val/test splits (applied via the loop below).
share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet4d',
    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

for key in ['val', 'train', 'test']:
    data[key].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24, ])
runner = dict(type='EpochBasedRunner', max_epochs=24)

custom_hooks = [
    dict(type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL'),
]

load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# use_mask = False
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 11.5
# ===> barrier - IoU = 44.1
# ===> bicycle - IoU = 25.89
# ===> bus - IoU = 41.0
# ===> car - IoU = 44.57
# ===> construction_vehicle - IoU = 21.88
# ===> motorcycle - IoU = 27.31
# ===> pedestrian - IoU = 25.95
# ===> traffic_cone - IoU = 29.04
# ===> trailer - IoU = 24.17
# ===> truck - IoU = 31.81
# ===> driveable_surface - IoU = 60.74
# ===> other_flat - IoU = 33.84
# ===> sidewalk - IoU = 36.62
# ===> terrain - IoU = 33.96
# ===> manmade - IoU = 21.54
# ===> vegetation - IoU = 21.36
# ===> mIoU of 6019 samples: 31.49
# {'mIoU': array([0.115, 0.441, 0.259, 0.41 , 0.446, 0.219, 0.273, 0.259, 0.29 ,
# 0.242, 0.318, 0.607, 0.338, 0.366, 0.34 , 0.215, 0.214, 0.839])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.107 | 0.115 | 0.116 |
# | barrier | 0.442 | 0.485 | 0.501 |
# | bicycle | 0.267 | 0.296 | 0.302 |
# | bus | 0.533 | 0.632 | 0.683 |
# | car | 0.516 | 0.590 | 0.616 |
# | construction_vehicle | 0.170 | 0.251 | 0.282 |
# | motorcycle | 0.231 | 0.325 | 0.350 |
# | pedestrian | 0.340 | 0.386 | 0.400 |
# | traffic_cone | 0.348 | 0.372 | 0.380 |
# | trailer | 0.232 | 0.317 | 0.400 |
# | truck | 0.427 | 0.514 | 0.559 |
# | driveable_surface | 0.566 | 0.649 | 0.736 |
# | other_flat | 0.302 | 0.341 | 0.374 |
# | sidewalk | 0.261 | 0.313 | 0.363 |
# | terrain | 0.258 | 0.333 | 0.399 |
# | manmade | 0.348 | 0.426 | 0.479 |
# | vegetation | 0.234 | 0.342 | 0.430 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.328 | 0.393 | 0.434 |
# +----------------------+----------+----------+----------+
# {'RayIoU': 0.3851476341258822, 'RayIoU@1': 0.3284556495395326, 'RayIoU@2': 0.39334760720480005, 'RayIoU@4': 0.43363964563331386}
\ No newline at end of file
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth4d-pano.py
0 → 100644
View file @
3b8d508a
# Panoptic-FlashOcc: BEVDepth4D + auxiliary centerness head for panoptic
# occupancy on nuScenes (ResNet-50 image backbone, 2-frame temporal fusion).
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),
    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# BEV grid: 200x200 cells of 0.4 m, one 6.4 m z-bin, depth bins over 1-45 m.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80
# (start, stop, step) of adjacent frame ids: exactly one extra temporal frame.
multi_adj_frame_id_cfg = (1, 1 + 1, 1)

model = dict(
    type='BEVDepth4DPano',
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=512,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVDepth',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=512,
        out_channels=numC_Trans,
        loss_depth_weight=1,
        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
        downsample=16),
    # Aligns the previous frame's BEV features before concatenation.
    pre_process=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_layer=[1],
        num_channels=[numC_Trans],
        stride=[1],
        backbone_output_ids=[0]),
    # Input channels = current + adjacent temporal frames.
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1),
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256),
    # Auxiliary CenterPoint-style head providing instance centers for the
    # panoptic grouping.
    aux_centerness_head=dict(
        type='Centerness_Head',
        task_specific_weight=[1, 1, 0, 0, 0],
        in_channels=256,
        tasks=[
            dict(num_class=10,
                 class_names=[
                     'car', 'truck', 'construction_vehicle', 'bus',
                     'trailer', 'barrier', 'motorcycle', 'bicycle',
                     'pedestrian', 'traffic_cone'
                 ]),
        ],
        common_heads=dict(
            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
        share_conv_channel=64,
        bbox_coder=dict(
            type='CenterPointBBoxCoder',
            pc_range=point_cloud_range[:2],
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            max_num=500,
            score_threshold=0.3,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            code_size=9),
        separate_head=dict(
            type='SeparateHead', init_bias=-2.19, final_kernel=3),
        loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
        norm_bbox=True),
    occ_head=dict(
        type='BEVOCCHead2D_V2',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=False,
        num_classes=18,
        use_predicter=True,
        class_balance=True,
        loss_occ=dict(
            type='CustomFocalLoss', use_sigmoid=True, loss_weight=1.0),
    ),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            point_cloud_range=point_cloud_range,
            grid_size=[800, 800, 40],
            voxel_size=voxel_size,
            out_size_factor=4,
            dense_reg=1,
            gaussian_overlap=0.1,
            max_objs=500,
            min_radius=2,
            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                          0.2, 0.2])),
    test_cfg=dict(
        pts=dict(
            max_per_img=500,
            max_pool_nms=False,
            min_radius=[4, 12, 10, 1, 0.85, 0.175],
            score_threshold=0.1,
            out_size_factor=4,
            voxel_size=voxel_size[:2],
            pre_max_size=1000,
            post_max_size=500,
            # Scale-NMS
            nms_type=['rotate'],
            nms_thr=[0.2],
            nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55,
                                 1.1, 1.0, 1.0, 1.5, 3.5]])),
)

# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# Only random horizontal/vertical BEV flips; no rotation or scaling.
bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(type='PrepareImageInputs',
         is_train=True,
         data_config=data_config,
         sequential=True),
    dict(type='LoadAnnotationsBEVDepth',
         bda_aug_conf=bda_aug_conf,
         classes=class_names,
         is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(type='LoadPointsFromFile',
         coord_type='LIDAR',
         load_dim=5,
         use_dim=5,
         file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D',
         keys=['img_inputs', 'gt_depth', 'voxel_semantics', 'mask_lidar',
               'mask_camera', 'gt_bboxes_3d', 'gt_labels_3d'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
    dict(type='LoadAnnotationsBEVDepth',
         bda_aug_conf=bda_aug_conf,
         classes=class_names,
         is_train=False),
    dict(type='LoadPointsFromFile',
         coord_type='LIDAR',
         load_dim=5,
         use_dim=5,
         file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='DefaultFormatBundle3D',
                 class_names=class_names,
                 with_label=False),
            dict(type='Collect3D',
                 keys=['points', 'img_inputs', 'gt_bboxes_3d',
                       'gt_labels_3d'])
        ])
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet4d',
    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

# Propagate the shared dataset settings into every split.
for split in ['val', 'train', 'test']:
    data[split].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
    dict(type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL'),
]
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# ===> per class IoU of 6019 samples:
# ===> others - IoU = 10.89
# ===> barrier - IoU = 43.92
# ===> bicycle - IoU = 24.42
# ===> bus - IoU = 41.91
# ===> car - IoU = 45.18
# ===> construction_vehicle - IoU = 18.73
# ===> motorcycle - IoU = 25.59
# ===> pedestrian - IoU = 25.67
# ===> traffic_cone - IoU = 25.86
# ===> trailer - IoU = 25.29
# ===> truck - IoU = 31.84
# ===> driveable_surface - IoU = 59.03
# ===> other_flat - IoU = 31.53
# ===> sidewalk - IoU = 34.67
# ===> terrain - IoU = 31.49
# ===> manmade - IoU = 19.91
# ===> vegetation - IoU = 19.31
# ===> mIoU of 6019 samples: 30.31
# {'mIoU': array([0.109, 0.439, 0.244, 0.419, 0.452, 0.187, 0.256, 0.257, 0.259,
# 0.253, 0.318, 0.59 , 0.315, 0.347, 0.315, 0.199, 0.193, 0.835])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.094 | 0.105 | 0.107 |
# | barrier | 0.411 | 0.460 | 0.480 |
# | bicycle | 0.252 | 0.286 | 0.293 |
# | bus | 0.541 | 0.646 | 0.698 |
# | car | 0.520 | 0.594 | 0.621 |
# | construction_vehicle | 0.164 | 0.235 | 0.264 |
# | motorcycle | 0.212 | 0.305 | 0.321 |
# | pedestrian | 0.326 | 0.373 | 0.389 |
# | traffic_cone | 0.312 | 0.341 | 0.348 |
# | trailer | 0.220 | 0.291 | 0.372 |
# | truck | 0.430 | 0.520 | 0.565 |
# | driveable_surface | 0.552 | 0.633 | 0.720 |
# | other_flat | 0.293 | 0.330 | 0.361 |
# | sidewalk | 0.242 | 0.291 | 0.340 |
# | terrain | 0.236 | 0.305 | 0.369 |
# | manmade | 0.303 | 0.378 | 0.429 |
# | vegetation | 0.193 | 0.294 | 0.381 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.312 | 0.376 | 0.415 |
# +----------------------+----------+----------+----------+
# 6019it [09:13, 10.87it/s]
# +----------------------+---------+---------+---------+
# | Class Names | RayPQ@1 | RayPQ@2 | RayPQ@4 |
# +----------------------+---------+---------+---------+
# | others | 0.020 | 0.028 | 0.030 |
# | barrier | 0.155 | 0.211 | 0.235 |
# | bicycle | 0.083 | 0.097 | 0.102 |
# | bus | 0.299 | 0.391 | 0.442 |
# | car | 0.277 | 0.360 | 0.384 |
# | construction_vehicle | 0.011 | 0.062 | 0.077 |
# | motorcycle | 0.098 | 0.149 | 0.166 |
# | pedestrian | 0.021 | 0.026 | 0.027 |
# | traffic_cone | 0.052 | 0.069 | 0.071 |
# | trailer | 0.043 | 0.062 | 0.071 |
# | truck | 0.158 | 0.248 | 0.293 |
# | driveable_surface | 0.440 | 0.559 | 0.680 |
# | other_flat | 0.065 | 0.089 | 0.107 |
# | sidewalk | 0.012 | 0.029 | 0.060 |
# | terrain | 0.009 | 0.028 | 0.053 |
# | manmade | 0.060 | 0.108 | 0.153 |
# | vegetation | 0.001 | 0.029 | 0.111 |
# +----------------------+---------+---------+---------+
# | MEAN | 0.106 | 0.150 | 0.180 |
# +----------------------+---------+---------+---------+
# {'RayIoU': 0.3676099569727112, 'RayIoU@1': 0.3118578145261225, 'RayIoU@2': 0.3757836068619914, 'RayIoU@4': 0.4151884495300196,
# 'RayPQ': 0.14529917059571107, 'RayPQ@1': 0.1061843618020449, 'RayPQ@2': 0.14961373290314467, 'RayPQ@4': 0.18009941708194366}
projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth4d.py
0 → 100644
View file @
3b8d508a
# Panoptic-FlashOcc (occupancy-only variant): BEVDepth4D with an occupancy
# head, no auxiliary detection head (ResNet-50, 2-frame temporal fusion).
_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
          '../../../mmdetection3d/configs/_base_/default_runtime.py']

plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

data_config = {
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
        'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),
    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.00,
}

# BEV grid: 200x200 cells of 0.4 m, one 6.4 m z-bin, depth bins over 1-45 m.
grid_config = {
    'x': [-40, 40, 0.4],
    'y': [-40, 40, 0.4],
    'z': [-1, 5.4, 6.4],
    'depth': [1.0, 45.0, 0.5],
}

voxel_size = [0.1, 0.1, 0.2]
numC_Trans = 80
# (start, stop, step) of adjacent frame ids: exactly one extra temporal frame.
multi_adj_frame_id_cfg = (1, 1 + 1, 1)

model = dict(
    type='BEVDepth4DOCC',
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        with_cp=True,
        style='pytorch',
        pretrained='torchvision://resnet50',
    ),
    img_neck=dict(
        type='CustomFPN',
        in_channels=[1024, 2048],
        out_channels=512,
        num_outs=1,
        start_level=0,
        out_ids=[0]),
    img_view_transformer=dict(
        type='LSSViewTransformerBEVDepth',
        grid_config=grid_config,
        input_size=data_config['input_size'],
        in_channels=512,
        out_channels=numC_Trans,
        loss_depth_weight=1,
        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
        downsample=16),
    # Aligns the previous frame's BEV features before concatenation.
    pre_process=dict(
        type='CustomResNet',
        numC_input=numC_Trans,
        num_layer=[1],
        num_channels=[numC_Trans],
        stride=[1],
        backbone_output_ids=[0]),
    # Input channels = current + adjacent temporal frames.
    img_bev_encoder_backbone=dict(
        type='CustomResNet',
        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg)) + 1),
        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
    img_bev_encoder_neck=dict(
        type='FPN_LSS',
        in_channels=numC_Trans * 8 + numC_Trans * 2,
        out_channels=256),
    occ_head=dict(
        type='BEVOCCHead2D_V2',
        in_dim=256,
        out_dim=256,
        Dz=16,
        use_mask=False,
        num_classes=18,
        use_predicter=True,
        class_balance=True,
        loss_occ=dict(
            type='CustomFocalLoss', use_sigmoid=True, loss_weight=1.0),
    )
)

# Data
dataset_type = 'NuScenesDatasetOccpancy'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

# Only random horizontal/vertical BEV flips; no rotation or scaling.
bda_aug_conf = dict(
    rot_lim=(-0., 0.),
    scale_lim=(1., 1.),
    flip_dx_ratio=0.5,
    flip_dy_ratio=0.5)

train_pipeline = [
    dict(type='PrepareImageInputs',
         is_train=True,
         data_config=data_config,
         sequential=True),
    dict(type='LoadAnnotationsBEVDepth',
         bda_aug_conf=bda_aug_conf,
         classes=class_names,
         is_train=True),
    dict(type='LoadOccGTFromFile'),
    dict(type='LoadPointsFromFile',
         coord_type='LIDAR',
         load_dim=5,
         use_dim=5,
         file_client_args=file_client_args),
    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D',
         keys=['img_inputs', 'gt_depth', 'voxel_semantics', 'mask_lidar',
               'mask_camera'])
]

test_pipeline = [
    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
    dict(type='LoadAnnotationsBEVDepth',
         bda_aug_conf=bda_aug_conf,
         classes=class_names,
         is_train=False),
    dict(type='LoadPointsFromFile',
         coord_type='LIDAR',
         load_dim=5,
         use_dim=5,
         file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='DefaultFormatBundle3D',
                 class_names=class_names,
                 with_label=False),
            dict(type='Collect3D', keys=['points', 'img_inputs'])
        ])
]

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

share_data_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    stereo=False,
    filter_empty_gt=False,
    img_info_prototype='bevdet4d',
    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
)

test_data_config = dict(
    pipeline=test_pipeline,
    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        data_root=data_root,
        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        test_mode=False,
        use_valid_flag=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='LiDAR'),
    val=test_data_config,
    test=test_data_config)

# Propagate the shared dataset settings into every split.
for split in ['val', 'train', 'test']:
    data[split].update(share_data_config)

# Optimizer
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=200,
    warmup_ratio=0.001,
    step=[24])
runner = dict(type='EpochBasedRunner', max_epochs=24)
custom_hooks = [
    dict(type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL'),
]
load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
# fp16 = dict(loss_scale='dynamic')
evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=5)
# use_mask = False
# ===> others - IoU = 9.99
# ===> barrier - IoU = 41.3
# ===> bicycle - IoU = 22.84
# ===> bus - IoU = 41.17
# ===> car - IoU = 41.89
# ===> construction_vehicle - IoU = 20.84
# ===> motorcycle - IoU = 25.25
# ===> pedestrian - IoU = 23.98
# ===> traffic_cone - IoU = 24.36
# ===> trailer - IoU = 26.39
# ===> truck - IoU = 30.41
# ===> driveable_surface - IoU = 58.26
# ===> other_flat - IoU = 31.86
# ===> sidewalk - IoU = 34.47
# ===> terrain - IoU = 31.96
# ===> manmade - IoU = 18.87
# ===> vegetation - IoU = 18.95
# ===> mIoU of 6019 samples: 29.57
# {'mIoU': array([0.1 , 0.413, 0.228, 0.412, 0.419, 0.208, 0.253, 0.24 , 0.244,
# 0.264, 0.304, 0.583, 0.319, 0.345, 0.32 , 0.189, 0.189, 0.833])}
# +----------------------+----------+----------+----------+
# | Class Names | RayIoU@1 | RayIoU@2 | RayIoU@4 |
# +----------------------+----------+----------+----------+
# | others | 0.095 | 0.106 | 0.109 |
# | barrier | 0.392 | 0.444 | 0.466 |
# | bicycle | 0.236 | 0.279 | 0.287 |
# | bus | 0.513 | 0.616 | 0.675 |
# | car | 0.492 | 0.567 | 0.596 |
# | construction_vehicle | 0.170 | 0.256 | 0.296 |
# | motorcycle | 0.216 | 0.304 | 0.330 |
# | pedestrian | 0.315 | 0.363 | 0.378 |
# | traffic_cone | 0.280 | 0.315 | 0.323 |
# | trailer | 0.210 | 0.294 | 0.397 |
# | truck | 0.419 | 0.517 | 0.565 |
# | driveable_surface | 0.540 | 0.621 | 0.708 |
# | other_flat | 0.284 | 0.320 | 0.354 |
# | sidewalk | 0.242 | 0.289 | 0.337 |
# | terrain | 0.233 | 0.302 | 0.367 |
# | manmade | 0.291 | 0.370 | 0.422 |
# | vegetation | 0.190 | 0.290 | 0.376 |
# +----------------------+----------+----------+----------+
# | MEAN | 0.301 | 0.368 | 0.411 |
# +----------------------+----------+----------+----------+
# {'RayIoU': 0.3599406945036808, 'RayIoU@1': 0.30094679699387594, 'RayIoU@2': 0.36785252629427645, 'RayIoU@4': 0.4110227602228899}
\ No newline at end of file
projects/mmdet3d_plugin/__init__.py
0 → 100644
View file @
3b8d508a
from
.datasets
import
*
from
.core
import
*
from
.models
import
*
projects/mmdet3d_plugin/core/__init__.py
0 → 100644
View file @
3b8d508a
from
.bbox
import
*
from
.hook
import
*
projects/mmdet3d_plugin/core/bbox/__init__.py
0 → 100644
View file @
3b8d508a
from
.coders
import
*
\ No newline at end of file
projects/mmdet3d_plugin/core/bbox/coders/__init__.py
0 → 100644
View file @
3b8d508a
from
.centerpoint_bbox_coders
import
CenterPointBBoxCoder
__all__
=
[
'CenterPointBBoxCoder'
]
\ No newline at end of file
projects/mmdet3d_plugin/core/bbox/coders/centerpoint_bbox_coders.py
0 → 100644
View file @
3b8d508a
# Copyright (c) OpenMMLab. All rights reserved.
import
torch
from
mmdet.core.bbox
import
BaseBBoxCoder
from
mmdet.core.bbox.builder
import
BBOX_CODERS
@BBOX_CODERS.register_module(force=True)
class CenterPointBBoxCoder(BaseBBoxCoder):
    """Bbox coder for CenterPoint.

    Decodes dense heatmap / regression predictions into per-sample top-k
    3D boxes (``decode``) or bare 3D centers (``center_decode``).

    Args:
        pc_range (list[float]): Range of point cloud.
        out_size_factor (int): Downsample factor of the model.
        voxel_size (list[float]): Size of voxel.
        post_center_range (list[float], optional): Limit of the center.
            Default: None.
        max_num (int, optional): Max number to be kept. Default: 100.
        score_threshold (float, optional): Threshold to filter boxes
            based on score. Default: None.
        code_size (int, optional): Code size of bboxes. Default: 9
    """

    def __init__(self,
                 pc_range,
                 out_size_factor,
                 voxel_size,
                 post_center_range=None,
                 max_num=100,
                 score_threshold=None,
                 code_size=9):
        self.pc_range = pc_range  # [x_min, y_min, ...]
        self.out_size_factor = out_size_factor
        self.voxel_size = voxel_size
        # e.g. [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
        self.post_center_range = post_center_range
        self.max_num = max_num
        self.score_threshold = score_threshold
        self.code_size = code_size

    def _gather_feat(self, feats, inds, feat_masks=None):
        """Given feats and indexes, returns the gathered feats.

        Args:
            feats (torch.Tensor): Features to be transposed and gathered
                with the shape of [B, 2, W, H].
            inds (torch.Tensor): Indexes with the shape of [B, N].
            feat_masks (torch.Tensor, optional): Mask of the feats.
                Default: None.

        Returns:
            torch.Tensor: Gathered feats.
        """
        dim = feats.size(2)
        # (B, N) -> (B, N, 1) -> (B, N, dim) so gather selects whole rows.
        inds = inds.unsqueeze(2).expand(inds.size(0), inds.size(1), dim)
        feats = feats.gather(1, inds)
        if feat_masks is not None:
            feat_masks = feat_masks.unsqueeze(2).expand_as(feats)
            feats = feats[feat_masks]
            feats = feats.view(-1, dim)
        return feats

    def _topk(self, scores, K=80):
        """Get indexes based on scores.

        Args:
            scores (torch.Tensor): scores with the shape of (B, N_cls, H, W).
            K (int, optional): Number to be kept. Defaults to 80.

        Returns:
            tuple[torch.Tensor]
                torch.Tensor: Selected scores with the shape of [B, K].
                torch.Tensor: Selected indexes with the shape of [B, K].
                torch.Tensor: Selected classes with the shape of [B, K].
                torch.Tensor: Selected y coord with the shape of [B, K].
                torch.Tensor: Selected x coord with the shape of [B, K].
        """
        batch, cat, height, width = scores.size()
        # First take the top-K peaks independently for every class.
        # (B, N_cls, K), (B, N_cls, K)
        topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K)
        # (B, N_cls, K): flat pixel index in [0, H*W-1].
        topk_inds = topk_inds % (height * width)
        # (B, N_cls, K): y coordinate (row = flat index // width).
        topk_ys = (topk_inds.float() /
                   torch.tensor(width, dtype=torch.float)).int().float()
        # (B, N_cls, K): x coordinate (column = flat index % width).
        topk_xs = (topk_inds % width).int().float()
        # Then take the top-K again across all classes' candidates.
        # (B, K), (B, K)
        topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K)
        # (B, K): class id of each kept peak (candidate index // K).
        topk_clses = (topk_ind / torch.tensor(K, dtype=torch.float)).int()
        # (B, N_cls*K, 1) --gather--> (B, K, 1) --> (B, K): flat pixel index.
        topk_inds = self._gather_feat(topk_inds.view(batch, -1, 1),
                                      topk_ind).view(batch, K)
        # (B, N_cls*K, 1) --gather--> (B, K, 1) --> (B, K): y coordinate.
        topk_ys = self._gather_feat(topk_ys.view(batch, -1, 1),
                                    topk_ind).view(batch, K)
        # (B, N_cls*K, 1) --gather--> (B, K, 1) --> (B, K): x coordinate.
        topk_xs = self._gather_feat(topk_xs.view(batch, -1, 1),
                                    topk_ind).view(batch, K)
        return topk_score, topk_inds, topk_clses, topk_ys, topk_xs

    def _transpose_and_gather_feat(self, feat, ind):
        """Given feats and indexes, returns the transposed and gathered feats.

        Args:
            feat (torch.Tensor): Features to be transposed and gathered
                with the shape of (B, N_c, H, W).
            ind (torch.Tensor): Indexes with the shape of [B, K].

        Returns:
            torch.Tensor: Transposed and gathered feats.
        """
        # (B, N_c, H, W) --> (B, H, W, N_c) --> (B, H*W, N_c)
        feat = feat.permute(0, 2, 3, 1).contiguous()
        feat = feat.view(feat.size(0), -1, feat.size(3))
        feat = self._gather_feat(feat, ind)  # (B, K, N_c)
        return feat

    def encode(self):
        # Encoding is not implemented for this coder; targets are built
        # elsewhere in the training pipeline.
        pass

    def decode(self,
               heat,
               rot_sine,
               rot_cosine,
               hei,
               dim,
               vel,
               reg=None,
               task_id=-1):
        """Decode bboxes.

        Args:
            heat (torch.Tensor): Heatmap with the shape of (B, N_cls, H, W).
            rot_sine (torch.Tensor): Sine of rotation with the shape of
                (B, 1, H, W).
            rot_cosine (torch.Tensor): Cosine of rotation with the shape of
                (B, 1, H, W).
            hei (torch.Tensor): Height of the boxes with the shape of
                (B, 1, H, W).
            dim (torch.Tensor): Dim of the boxes with the shape of
                (B, 3, H, W).
            vel (torch.Tensor): Velocity with the shape of (B, 1, H, W).
            reg (torch.Tensor, optional): Regression value of the boxes in
                2D with the shape of (B, 2, H, W). Default: None.
            task_id (int, optional): Index of task. Default: -1.
                (Unused in this implementation.)

        Returns:
            list[dict]: Decoded boxes. List[p_dict0, p_dict1, ...]
                p_dict = {
                    'bboxes': boxes3d,  # (K', 9)
                    'scores': scores,   # (K', )
                    'labels': labels    # (K', )
                }
        """
        batch, cat, _, _ = heat.size()
        # All five outputs are (B, K).
        scores, inds, clses, ys, xs = self._topk(heat, K=self.max_num)
        if reg is not None:
            # Sub-pixel center offset predicted at each peak.
            reg = self._transpose_and_gather_feat(reg, inds)  # (B, K, 2)
            reg = reg.view(batch, self.max_num, 2)
            # (B, K, 1) + (B, K, 1) --> (B, K, 1)
            xs = xs.view(batch, self.max_num, 1) + reg[:, :, 0:1]
            # (B, K, 1) + (B, K, 1) --> (B, K, 1)
            ys = ys.view(batch, self.max_num, 1) + reg[:, :, 1:2]
        else:
            # No offset branch: assume the center sits at the cell middle.
            xs = xs.view(batch, self.max_num, 1) + 0.5
            ys = ys.view(batch, self.max_num, 1) + 0.5
        # rotation value and direction label
        rot_sine = self._transpose_and_gather_feat(rot_sine, inds)  # (B, K, 1)
        rot_sine = rot_sine.view(batch, self.max_num, 1)
        rot_cosine = self._transpose_and_gather_feat(rot_cosine, inds)  # (B, K, 1)
        rot_cosine = rot_cosine.view(batch, self.max_num, 1)
        rot = torch.atan2(rot_sine, rot_cosine)  # (B, K, 1)
        # height in the bev
        hei = self._transpose_and_gather_feat(hei, inds)
        hei = hei.view(batch, self.max_num, 1)  # (B, K, 1)
        # dim of the box
        dim = self._transpose_and_gather_feat(dim, inds)
        dim = dim.view(batch, self.max_num, 3)  # (B, K, 3)
        # class label
        clses = clses.view(batch, self.max_num).float()  # (B, K)
        scores = scores.view(batch, self.max_num)  # (B, K)
        # Convert feature-map cell coordinates to metric BEV coordinates.
        xs = xs.view(batch, self.max_num, 1) * self.out_size_factor * self.voxel_size[0] + self.pc_range[0]
        ys = ys.view(batch, self.max_num, 1) * self.out_size_factor * self.voxel_size[1] + self.pc_range[1]
        if vel is None:
            # KITTI FORMAT
            final_box_preds = torch.cat([xs, ys, hei, dim, rot], dim=2)
        else:
            # exist velocity, nuscene format
            vel = self._transpose_and_gather_feat(vel, inds)  # (B, K, 2)
            vel = vel.view(batch, self.max_num, 2)
            # (B, K, 9)
            final_box_preds = torch.cat([xs, ys, hei, dim, rot, vel], dim=2)
        final_scores = scores
        final_preds = clses
        # use score threshold
        if self.score_threshold is not None:
            thresh_mask = final_scores > self.score_threshold  # (B, K)
        if self.post_center_range is not None:
            # NOTE(review): this overwrites the stored list with a tensor on
            # the first call; torch.tensor() then re-copies it on every later
            # call — consider torch.as_tensor / caching. Behavior unchanged.
            self.post_center_range = torch.tensor(
                self.post_center_range, device=heat.device)
            # Keep only boxes whose center lies inside the post range.
            # (B, K, 3) --> (B, K)
            mask = (final_box_preds[..., :3] >=
                    self.post_center_range[:3]).all(2)
            # (B, K, 3) --> (B, K)
            mask &= (final_box_preds[..., :3] <=
                     self.post_center_range[3:]).all(2)
            predictions_dicts = []
            for i in range(batch):
                cmask = mask[i, :]  # (K, )
                if self.score_threshold:
                    # NOTE(review): truthiness check — a threshold of exactly
                    # 0 is silently skipped here (unlike `is not None` above).
                    cmask &= thresh_mask[i]  # (K, )
                boxes3d = final_box_preds[i, cmask]  # (K', 9)
                scores = final_scores[i, cmask]  # (K', )
                labels = final_preds[i, cmask]  # (K', )
                predictions_dict = {
                    'bboxes': boxes3d,  # (K', 9)
                    'scores': scores,  # (K', )
                    'labels': labels  # (K', )
                }
                # List[p_dict0, p_dict1, ...] len = batch_size
                predictions_dicts.append(predictions_dict)
        else:
            raise NotImplementedError(
                'Need to reorganize output as a batch, only '
                'support post_center_range is not None for now!')
        return predictions_dicts

    def center_decode(self, heat, hei, reg=None, task_id=-1):
        """Decode only 3D centers (x, y, z) from the heatmap.

        Mirrors :meth:`decode`'s top-k / offset / range-filter logic but
        returns bare center points instead of full boxes.

        Args:
            heat (torch.Tensor): Heatmap with the shape of (B, N_cls, H, W).
            hei (torch.Tensor): Height predictions, (B, 1, H, W).
            reg (torch.Tensor, optional): 2D sub-pixel center offset,
                (B, 2, H, W). Default: None.
            task_id (int, optional): Index of task. Default: -1.
                (Unused in this implementation.)

        Returns:
            list[dict]: one dict per batch sample with keys
                'centers' (K', 3), 'scores' (K', ), 'labels' (K', ).
        """
        batch, cat, _, _ = heat.size()
        # All five outputs are (B, K).
        scores, inds, clses, ys, xs = self._topk(heat, K=self.max_num)
        if reg is not None:
            reg = self._transpose_and_gather_feat(reg, inds)  # (B, K, 2)
            reg = reg.view(batch, self.max_num, 2)
            # (B, K, 1) + (B, K, 1) --> (B, K, 1)
            xs = xs.view(batch, self.max_num, 1) + reg[:, :, 0:1]
            # (B, K, 1) + (B, K, 1) --> (B, K, 1)
            ys = ys.view(batch, self.max_num, 1) + reg[:, :, 1:2]
        else:
            xs = xs.view(batch, self.max_num, 1) + 0.5
            ys = ys.view(batch, self.max_num, 1) + 0.5
        # height in the bev
        hei = self._transpose_and_gather_feat(hei, inds)
        hei = hei.view(batch, self.max_num, 1)  # (B, K, 1)
        # class label
        clses = clses.view(batch, self.max_num).float()  # (B, K)
        scores = scores.view(batch, self.max_num)  # (B, K)
        # Convert feature-map cell coordinates to metric BEV coordinates.
        xs = xs.view(batch, self.max_num, 1) * self.out_size_factor * self.voxel_size[0] + self.pc_range[0]
        ys = ys.view(batch, self.max_num, 1) * self.out_size_factor * self.voxel_size[1] + self.pc_range[1]
        final_center_preds = torch.cat([xs, ys, hei], dim=2)  # (B, K, 3)
        final_scores = scores
        final_preds = clses
        # use score threshold
        if self.score_threshold is not None:
            thresh_mask = final_scores > self.score_threshold  # (B, K)
        if self.post_center_range is not None:
            # NOTE(review): same in-place overwrite of post_center_range as
            # in decode() — see note there.
            self.post_center_range = torch.tensor(
                self.post_center_range, device=heat.device)
            # (B, K, 3) --> (B, K)
            mask = (final_center_preds[..., :3] >=
                    self.post_center_range[:3]).all(2)
            # (B, K, 3) --> (B, K)
            mask &= (final_center_preds[..., :3] <=
                     self.post_center_range[3:]).all(2)
            predictions_dicts = []
            for i in range(batch):
                cmask = mask[i, :]  # (K, )
                if self.score_threshold:
                    cmask &= thresh_mask[i]  # (K, )
                centers = final_center_preds[i, cmask]  # (K', 3)
                scores = final_scores[i, cmask]  # (K', )
                labels = final_preds[i, cmask]  # (K', )
                predictions_dict = {
                    'centers': centers,  # (K', 3)
                    'scores': scores,  # (K', )
                    'labels': labels  # (K', )
                }
                # List[p_dict0, p_dict1, ...] len = batch_size
                predictions_dicts.append(predictions_dict)
        else:
            raise NotImplementedError(
                'Need to reorganize output as a batch, only '
                'support post_center_range is not None for now!')
        return predictions_dicts
Prev
1
2
3
4
5
6
…
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment