"tests/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "1edd0debaa3103e0dc230551c36e22ff60a56af4"
Commit ede10946 authored by Qing Lian's avatar Qing Lian Committed by ZwwWayne
Browse files

[Fix] Fix some loading bugs and support fov_image_based mode in Waymo dataset. (#1942)



* modify sample_id to sample_idx and support fov_image_based on the Waymo dataset

* Update waymo_metric.py

* Minor fix

* Minor fix

* Minor fix

* Minor fix

* Minor fix

* Minor fix

* Minor fixes

* Minor fixes

* Remove optional

* fix dataset instance conversion bugs

* Add a blank line to fix the doc compilation format

* Fix the bin file name in waymo_fov config

* Resolve conflicts

* fix ci and other things
Co-authored-by: Tai-Wang <tab_wang@outlook.com>
Co-authored-by: lianqing11 <lianqing11@foxmail.com>
Co-authored-by: ChaimZhu <zhuchenming@pjlab.org.cn>
parent 17ac0691
@@ -52,6 +52,7 @@ train_dataloader = dict(
         data_prefix=dict(img='training/image_2'),
         pipeline=train_pipeline,
         modality=input_modality,
+        load_type='fov_image_based',
         test_mode=False,
         metainfo=metainfo,
         # we use box_type_3d='Camera' in monocular 3d
@@ -70,6 +71,7 @@ val_dataloader = dict(
         ann_file='kitti_infos_val.pkl',
         pipeline=test_pipeline,
         modality=input_modality,
+        load_type='fov_image_based',
         metainfo=metainfo,
         test_mode=True,
         box_type_3d='Camera'))
...
@@ -65,7 +65,7 @@ train_dataloader = dict(
             CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
             CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
         ann_file='nuscenes_infos_train.pkl',
-        task='mono_det',
+        load_type='mv_image_based',
         pipeline=train_pipeline,
         metainfo=metainfo,
         modality=input_modality,
@@ -92,7 +92,7 @@ val_dataloader = dict(
             CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
             CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
         ann_file='nuscenes_infos_val.pkl',
-        task='mono_det',
+        load_type='mv_image_based',
         pipeline=test_pipeline,
         modality=input_modality,
         metainfo=metainfo,
...
# dataset settings
# D3 in the config name means the whole dataset is divided into 3 folds
# We only use one fold for efficient experiments
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
class_names = ['Car', 'Pedestrian', 'Cyclist']
input_modality = dict(use_lidar=False, use_camera=True)
file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
train_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=False,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
# base shape (1248, 832), scale (0.95, 1.05)
dict(
type='RandomResize3D',
        scale=(1248, 832),
ratio_range=(0.95, 1.05),
keep_ratio=True,
),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers_2d', 'depths'
]),
]
test_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(1., 1.),
keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img']),
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='RandomResize3D',
scale=(1248, 832),
ratio_range=(1., 1.),
keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img']),
]
metainfo = dict(CLASSES=class_names)
train_dataloader = dict(
batch_size=3,
num_workers=3,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_train.pkl',
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
load_type='fov_image_based',
        # load one frame every five frames
load_interval=5))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
ann_file='waymo_infos_val.pkl',
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
load_type='fov_image_based',
))
test_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
data_prefix=dict(
pts='training/velodyne',
CAM_FRONT='training/image_0',
CAM_FRONT_RIGHT='training/image_1',
CAM_FRONT_LEFT='training/image_2',
CAM_SIDE_RIGHT='training/image_3',
CAM_SIDE_LEFT='training/image_4'),
ann_file='waymo_infos_val.pkl',
pipeline=eval_pipeline,
modality=input_modality,
test_mode=True,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Camera',
load_type='fov_image_based',
))
val_evaluator = dict(
type='WaymoMetric',
ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
waymo_bin_file='./data/waymo/waymo_format/fov_gt.bin',
data_root='./data/waymo/waymo_format',
metric='LET_mAP',
load_type='fov_image_based',
)
test_evaluator = val_evaluator
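Note: the evaluator and the dataset must agree on the load mode; a minimal sketch of the pairing used throughout this config (keys as defined above):

# Sketch: keep the dataset's and the metric's load_type in sync;
# WaymoMetric groups predictions per frame or per image based on it.
load_type = 'fov_image_based'  # or 'frame_based' / 'mv_image_based'
val_dataloader = dict(dataset=dict(load_type=load_type))
val_evaluator = dict(type='WaymoMetric', load_type=load_type)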
@@ -81,7 +81,7 @@ train_dataloader = dict(
         # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
         # and box_type_3d='Depth' in sunrgbd and scannet dataset.
         box_type_3d='Camera',
-        task='mono_det',
+        load_type='mv_image_based',
         # load one frame every five frames
         load_interval=5))
@@ -109,7 +109,7 @@ val_dataloader = dict(
         # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
         # and box_type_3d='Depth' in sunrgbd and scannet dataset.
         box_type_3d='Camera',
-        task='mono_det',
+        load_type='mv_image_based',
     ))
 test_dataloader = dict(
@@ -136,7 +136,7 @@ test_dataloader = dict(
         # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
         # and box_type_3d='Depth' in sunrgbd and scannet dataset.
         box_type_3d='Camera',
-        task='mono_det',
+        load_type='mv_image_based',
     ))
 val_evaluator = dict(
@@ -145,5 +145,6 @@ val_evaluator = dict(
     waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
     data_root='./data/waymo/waymo_format',
     metric='LET_mAP',
-    task='mono_det')
+    load_type='mv_image_based',
+)
 test_evaluator = val_evaluator
_base_ = [
'../_base_/datasets/waymoD5-fov-mono3d-3class.py',
'../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
type='mmdet.ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True)),
neck=dict(num_outs=3),
bbox_head=dict(
num_classes=3,
bbox_code_size=7,
pred_attrs=False,
pred_velo=False,
pred_bbox2d=True,
use_onlyreg_proj=True,
strides=(8, 16, 32),
regress_ranges=((-1, 128), (128, 256), (256, 1e8)),
group_reg_dims=(2, 1, 3, 1, 16,
4), # offset, depth, size, rot, kpts, bbox2d
reg_branch=(
(256, ), # offset
(256, ), # depth
(256, ), # size
(256, ), # rot
(256, ), # kpts
(256, ) # bbox2d
),
centerness_branch=(256, ),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
use_depth_classifier=True,
depth_branch=(256, ),
depth_range=(0, 50),
depth_unit=10,
division='uniform',
depth_bins=6,
pred_keypoints=True,
weight_dim=1,
loss_depth=dict(
type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0,
loss_weight=1.0),
loss_bbox2d=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0),
loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0),
bbox_coder=dict(
type='PGDBBoxCoder',
base_depths=((41.01, 18.44), ),
base_dims=(
(4.73, 1.77, 2.08),
(0.91, 1.74, 0.84),
(1.81, 1.77, 0.84),
),
code_size=7)),
# set weight 1.0 for base 7 dims (offset, depth, size, rot)
# 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets
train_cfg=dict(code_weight=[
1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0
]),
test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
# optimizer
optim_wrapper = dict(
optimizer=dict(
type='SGD',
lr=0.008,
),
paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
clip_grad=dict(max_norm=35, norm_type=2))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0 / 3,
by_epoch=False,
begin=0,
end=500),
dict(
type='MultiStepLR',
begin=0,
end=24,
by_epoch=True,
milestones=[16, 22],
gamma=0.1)
]
total_epochs = 24
runner = dict(max_epochs=total_epochs)
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
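Note: a quick way to verify the composed config is to load it with mmengine (a sketch; the config path here is illustrative, not the file's actual name):

from mmengine.config import Config

# Load the merged config and confirm that the fov_image_based mode is
# inherited from the _base_ dataset file above.
cfg = Config.fromfile('configs/pgd/pgd_waymoD5_fov_mono3d.py')  # hypothetical path
assert cfg.train_dataloader.dataset.load_type == 'fov_image_based'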
_base_ = [
'../_base_/datasets/waymoD5-mv-mono3d-3class.py',
'../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py',
'../_base_/default_runtime.py'
]
# model settings
model = dict(
backbone=dict(
type='mmdet.ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True)),
neck=dict(num_outs=3),
bbox_head=dict(
num_classes=3,
bbox_code_size=7,
pred_attrs=False,
pred_velo=False,
pred_bbox2d=True,
use_onlyreg_proj=True,
strides=(8, 16, 32),
regress_ranges=((-1, 128), (128, 256), (256, 1e8)),
group_reg_dims=(2, 1, 3, 1, 16,
4), # offset, depth, size, rot, kpts, bbox2d
reg_branch=(
(256, ), # offset
(256, ), # depth
(256, ), # size
(256, ), # rot
(256, ), # kpts
(256, ) # bbox2d
),
centerness_branch=(256, ),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_centerness=dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
use_depth_classifier=True,
depth_branch=(256, ),
depth_range=(0, 50),
depth_unit=10,
division='uniform',
depth_bins=6,
pred_keypoints=True,
weight_dim=1,
loss_depth=dict(
type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0,
loss_weight=1.0),
loss_bbox2d=dict(
type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0),
loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0),
bbox_coder=dict(
type='PGDBBoxCoder',
base_depths=((41.01, 18.44), ),
base_dims=(
(4.73, 1.77, 2.08),
(0.91, 1.74, 0.84),
(1.81, 1.77, 0.84),
),
code_size=7)),
# set weight 1.0 for base 7 dims (offset, depth, size, rot)
# 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets
train_cfg=dict(code_weight=[
1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0
]),
test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
# optimizer
optim_wrapper = dict(
optimizer=dict(
type='SGD',
lr=0.008,
),
paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
clip_grad=dict(max_norm=35, norm_type=2))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0 / 3,
by_epoch=False,
begin=0,
end=500),
dict(
type='MultiStepLR',
begin=0,
end=24,
by_epoch=True,
milestones=[16, 22],
gamma=0.1)
]
total_epochs = 24
runner = dict(max_epochs=total_epochs)
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
@@ -32,6 +32,15 @@ class KittiDataset(Det3DDataset):
             - 'LiDAR': Box in LiDAR coordinates.
             - 'Depth': Box in depth coordinates, usually for indoor dataset.
             - 'Camera': Box in camera coordinates.
+        load_type (str): Type of loading mode. Defaults to 'frame_based'.
+
+            - 'frame_based': Load all of the instances in the frame.
+            - 'mv_image_based': Load all of the instances in the frame and
+              convert them to the FOV-based data type to support image-based
+              detectors.
+            - 'fov_image_based': Only load the instances inside the default
+              cam, and convert them to the FOV-based data type to support
+              image-based detectors.
         filter_empty_gt (bool): Whether to filter the data with empty GT.
             If it's set to be True, the example with empty annotations after
             data pipeline will be dropped and a random example will be chosen
@@ -54,7 +63,7 @@ class KittiDataset(Det3DDataset):
                  pipeline: List[Union[dict, Callable]] = [],
                  modality: dict = dict(use_lidar=True),
                  default_cam_key: str = 'CAM2',
-                 task: str = 'lidar_det',
+                 load_type: str = 'frame_based',
                  box_type_3d: str = 'LiDAR',
                  filter_empty_gt: bool = True,
                  test_mode: bool = False,
@@ -62,8 +71,9 @@ class KittiDataset(Det3DDataset):
                  **kwargs) -> None:
         self.pcd_limit_range = pcd_limit_range
-        assert task in ('lidar_det', 'mono_det')
-        self.task = task
+        assert load_type in ('frame_based', 'mv_image_based',
+                             'fov_image_based')
+        self.load_type = load_type
         super().__init__(
             data_root=data_root,
             ann_file=ann_file,
@@ -113,7 +123,7 @@ class KittiDataset(Det3DDataset):
             info['plane'] = plane_lidar
-        if self.task == 'mono_det' and self.load_eval_anns:
+        if self.load_type == 'fov_image_based' and self.load_eval_anns:
             info['instances'] = info['cam_instances'][self.default_cam_key]
         info = super().parse_data_info(info)
@@ -144,7 +154,7 @@ class KittiDataset(Det3DDataset):
             ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
             ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
-            if self.task == 'mono_det':
+            if self.load_type in ['fov_image_based', 'mv_image_based']:
                 ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
                 ann_info['gt_bboxes_labels'] = np.array(0, dtype=np.int64)
                 ann_info['centers_2d'] = np.zeros((0, 2), dtype=np.float32)
...
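Note: after this change KittiDataset validates load_type instead of task. A minimal construction sketch (import path assumed from mmdet3d dev-1.x; data paths illustrative):

from mmdet3d.datasets import KittiDataset  # assumed import path

# 'frame_based' keeps whole-frame samples; the two image-based modes
# yield per-image samples for monocular detectors.
dataset = KittiDataset(
    data_root='data/kitti/',
    ann_file='kitti_infos_val.pkl',
    data_prefix=dict(img='training/image_2'),
    load_type='fov_image_based',
    box_type_3d='Camera',
    test_mode=True)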
@@ -22,7 +22,6 @@ class NuScenesDataset(Det3DDataset):
     Args:
         data_root (str): Path of dataset root.
         ann_file (str): Path of annotation file.
-        task (str): Detection task. Defaults to 'lidar_det'.
         pipeline (list[dict]): Pipeline used for data processing.
             Defaults to [].
         box_type_3d (str): Type of 3D box of this dataset.
@@ -33,6 +32,15 @@ class NuScenesDataset(Det3DDataset):
             - 'LiDAR': Box in LiDAR coordinates.
             - 'Depth': Box in depth coordinates, usually for indoor dataset.
             - 'Camera': Box in camera coordinates.
+        load_type (str): Type of loading mode. Defaults to 'frame_based'.
+
+            - 'frame_based': Load all of the instances in the frame.
+            - 'mv_image_based': Load all of the instances in the frame and
+              convert them to the FOV-based data type to support image-based
+              detectors.
+            - 'fov_image_based': Only load the instances inside the default
+              cam, and convert them to the FOV-based data type to support
+              image-based detectors.
         modality (dict): Modality to specify the sensor data used as input.
             Defaults to dict(use_camera=False, use_lidar=True).
         filter_empty_gt (bool): Whether to filter the data with empty GT.
@@ -58,9 +66,9 @@ class NuScenesDataset(Det3DDataset):
     def __init__(self,
                  data_root: str,
                  ann_file: str,
-                 task: str = 'lidar_det',
                  pipeline: List[Union[dict, Callable]] = [],
                  box_type_3d: str = 'LiDAR',
+                 load_type: str = 'frame_based',
                  modality: dict = dict(
                      use_camera=False,
                      use_lidar=True,
@@ -74,8 +82,9 @@ class NuScenesDataset(Det3DDataset):
         self.with_velocity = with_velocity
         # TODO: Redesign multi-view data process in the future
-        assert task in ('lidar_det', 'mono_det', 'multi-view_det')
-        self.task = task
+        assert load_type in ('frame_based', 'mv_image_based',
+                             'fov_image_based')
+        self.load_type = load_type
         assert box_type_3d.lower() in ('lidar', 'camera')
         super().__init__(
@@ -144,7 +153,7 @@ class NuScenesDataset(Det3DDataset):
             ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
             ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
-            if self.task == 'mono3d':
+            if self.load_type in ['fov_image_based', 'mv_image_based']:
                 ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
                 ann_info['gt_bboxes_labels'] = np.array(0, dtype=np.int64)
                 ann_info['attr_labels'] = np.array(0, dtype=np.int64)
@@ -154,7 +163,7 @@ class NuScenesDataset(Det3DDataset):
         # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
         # the same as KITTI (0.5, 0.5, 0)
         # TODO: Unify the coordinates
-        if self.task == 'mono_det':
+        if self.load_type in ['fov_image_based', 'mv_image_based']:
             gt_bboxes_3d = CameraInstance3DBoxes(
                 ann_info['gt_bboxes_3d'],
                 box_dim=ann_info['gt_bboxes_3d'].shape[-1],
@@ -182,7 +191,7 @@ class NuScenesDataset(Det3DDataset):
             dict: Has `ann_info` in training stage. And
             all path has been converted to absolute path.
         """
-        if self.task == 'mono_det':
+        if self.load_type == 'mv_image_based':
             data_list = []
             if self.modality['use_lidar']:
                 info['lidar_points']['lidar_path'] = \
...
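Note: with load_type='mv_image_based', parse_data_info above splits each frame-wise info into one sample per camera, so every nuScenes frame yields six image samples. A sketch (constructor per this diff; import path assumed):

from mmdet3d.datasets import NuScenesDataset  # assumed import path

dataset = NuScenesDataset(
    data_root='data/nuscenes/',
    ann_file='nuscenes_infos_val.pkl',
    load_type='mv_image_based',  # one sample per camera image
    box_type_3d='Camera',
    test_mode=True)
# len(dataset) is then num_frames * 6 rather than num_frames.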
@@ -45,6 +45,15 @@ class WaymoDataset(KittiDataset):
             - 'LiDAR': Box in LiDAR coordinates.
             - 'Depth': Box in depth coordinates, usually for indoor dataset.
             - 'Camera': Box in camera coordinates.
+        load_type (str): Type of loading mode. Defaults to 'frame_based'.
+
+            - 'frame_based': Load all of the instances in the frame.
+            - 'mv_image_based': Load all of the instances in the frame and
+              convert them to the FOV-based data type to support image-based
+              detectors.
+            - 'fov_image_based': Only load the instances inside the default
+              cam, and convert them to the FOV-based data type to support
+              image-based detectors.
         filter_empty_gt (bool): Whether to filter the data with empty GT.
             If it's set to be True, the example with empty annotations after
             data pipeline will be dropped and a random example will be chosen
@@ -57,10 +66,6 @@ class WaymoDataset(KittiDataset):
         cam_sync_instances (bool): If use the camera sync label
             supported from waymo version 1.3.1. Defaults to False.
         load_interval (int): load frame interval. Defaults to 1.
-        task (str): task for 3D detection (lidar, mono3d).
-            lidar: take all the ground trurh in the frame.
-            mono3d: take the groundtruth that can be seen in the cam.
-            Defaults to 'lidar_det'.
         max_sweeps (int): max sweep for each frame. Defaults to 0.
     """
     METAINFO = {'classes': ('Car', 'Pedestrian', 'Cyclist')}
@@ -79,12 +84,12 @@ class WaymoDataset(KittiDataset):
                  modality: dict = dict(use_lidar=True),
                  default_cam_key: str = 'CAM_FRONT',
                  box_type_3d: str = 'LiDAR',
+                 load_type: str = 'frame_based',
                  filter_empty_gt: bool = True,
                  test_mode: bool = False,
                  pcd_limit_range: List[float] = [0, -40, -3, 70.4, 40, 0.0],
                  cam_sync_instances: bool = False,
                  load_interval: int = 1,
-                 task: str = 'lidar_det',
                  max_sweeps: int = 0,
                  **kwargs) -> None:
         self.load_interval = load_interval
@@ -108,7 +113,7 @@ class WaymoDataset(KittiDataset):
             default_cam_key=default_cam_key,
             data_prefix=data_prefix,
             test_mode=test_mode,
-            task=task,
+            load_type=load_type,
             **kwargs)

     def parse_ann_info(self, info: dict) -> dict:
@@ -151,7 +156,7 @@ class WaymoDataset(KittiDataset):
             centers_2d = np.zeros((0, 2), dtype=np.float32)
             depths = np.zeros((0), dtype=np.float32)
-        if self.task == 'mono_det':
+        if self.load_type in ['fov_image_based', 'mv_image_based']:
             gt_bboxes_3d = CameraInstance3DBoxes(
                 ann_info['gt_bboxes_3d'],
                 box_dim=ann_info['gt_bboxes_3d'].shape[-1],
@@ -186,10 +191,19 @@ class WaymoDataset(KittiDataset):
     def parse_data_info(self, info: dict) -> dict:
         """if task is lidar or multiview det, use super() method elif task is
         mono3d, split the info from frame-wise to img-wise."""
-        if self.task != 'mono_det':
-            if self.cam_sync_instances:
-                # use the cam sync labels
-                info['instances'] = info['cam_sync_instances']
+        if self.cam_sync_instances:
+            info['instances'] = info['cam_sync_instances']
+
+        if self.load_type == 'frame_based':
+            return super().parse_data_info(info)
+        elif self.load_type == 'fov_image_based':
+            # only loading the fov image and the fov instance
+            new_image_info = {}
+            new_image_info[self.default_cam_key] = \
+                info['images'][self.default_cam_key]
+            info['images'] = new_image_info
+            info['instances'] = info['cam_instances'][self.default_cam_key]
             return super().parse_data_info(info)
         else:
             # in the mono3d, the instances is from cam sync.
@@ -222,7 +236,7 @@ class WaymoDataset(KittiDataset):
                 # TODO check if need to modify the sample id
                 # TODO check when will use it except for evaluation.
-                camera_info['sample_id'] = info['sample_id']
+                camera_info['sample_idx'] = info['sample_idx']
                 if not self.test_mode:
                     # used in training
...
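Note: the fov_image_based branch of parse_data_info above simply narrows the frame-wise info to the default camera. A standalone sketch of that step (hypothetical helper, not repo API):

def keep_default_cam_only(info: dict, default_cam_key: str = 'CAM_FRONT') -> dict:
    # Keep only the default camera's image entry and adopt its
    # per-camera instances, mirroring the diff above.
    info['images'] = {default_cam_key: info['images'][default_cam_key]}
    info['instances'] = info['cam_instances'][default_cam_key]
    return info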
@@ -45,8 +45,15 @@ class WaymoMetric(KittiMetric):
         submission_prefix (str, optional): The prefix of submission data.
             If not specified, the submission data will not be generated.
             Default: None.
-        task: (str, optional): task for 3D detection, if cam, would filter
-            the points that outside the image.
+        load_type (str, optional): Type of loading mode during training.
+
+            - 'frame_based': Load all of the instances in the frame.
+            - 'mv_image_based': Load all of the instances in the frame and
+              convert them to the FOV-based data type to support image-based
+              detectors.
+            - 'fov_image_based': Only load the instances inside the default
+              cam, and convert them to the FOV-based data type to support
+              image-based detectors.
         default_cam_key (str, optional): The default camera for lidar to
             camera conversion. By default, KITTI: CAM2, Waymo: CAM_FRONT.
         use_pred_sample_idx (bool, optional): In formatting results, use the
@@ -76,7 +83,7 @@ class WaymoMetric(KittiMetric):
                  prefix: Optional[str] = None,
                  pklfile_prefix: str = None,
                  submission_prefix: str = None,
-                 task='lidar_det',
+                 load_type: str = 'frame_based',
                  default_cam_key: str = 'CAM_FRONT',
                  use_pred_sample_idx: bool = False,
                  collect_device: str = 'cpu',
@@ -85,7 +92,7 @@ class WaymoMetric(KittiMetric):
         self.waymo_bin_file = waymo_bin_file
         self.data_root = data_root
         self.split = split
-        self.task = task
+        self.load_type = load_type
         self.use_pred_sample_idx = use_pred_sample_idx
         self.convert_kitti_format = convert_kitti_format
@@ -124,8 +131,8 @@ class WaymoMetric(KittiMetric):
         assert len(results) == len(self.data_infos), \
             'invalid list length of network outputs'
         # different from kitti, waymo do not need to convert the ann file
-        # handle the mono3d task
-        if self.task == 'mono_det':
+        # handle the mv_image_based load_type
+        if self.load_type == 'mv_image_based':
             new_data_infos = []
             for info in self.data_infos:
                 height = info['images'][self.default_cam_key]['height']
@@ -425,7 +432,7 @@ class WaymoMetric(KittiMetric):
         lidar2cam = cam0_info['images'][self.default_cam_key]['lidar2img']
         lidar2cam = np.array(lidar2cam).astype(np.float32)
         box_preds_camera = box_preds_lidar.convert_to(
-            Box3DMode.CAM, np.linalg.inv(lidar2cam), correct_yaw=True)
+            Box3DMode.CAM, lidar2cam, correct_yaw=True)
         # Note: bbox is meaningless in final evaluation, set to 0
         merged_box_dict = dict(
             bbox=np.zeros([box_preds_lidar.tensor.shape[0], 4]),
@@ -470,7 +477,7 @@ class WaymoMetric(KittiMetric):
             sample_idx = sample_id_list[idx]
             info = self.data_infos[sample_idx]
-            if self.task == 'mono_det':
+            if self.load_type == 'mv_image_based':
                 if idx % self.num_cams == 0:
                     box_dict_per_frame = []
                     cam0_key = list(info['images'].keys())[0]
@@ -487,7 +494,7 @@ class WaymoMetric(KittiMetric):
             # If you want to use another camera, please modify it.
             image_shape = (info['images'][self.default_cam_key]['height'],
                            info['images'][self.default_cam_key]['width'])
-            if self.task == 'mono_det':
+            if self.load_type == 'mv_image_based':
                 box_dict_per_frame.append(box_dict)
                 if (idx + 1) % self.num_cams != 0:
                     continue
@@ -587,7 +594,7 @@ class WaymoMetric(KittiMetric):
     def convert_valid_bboxes(self, box_dict: dict, info: dict):
         """Convert the predicted boxes into valid ones. Should handle the
-        different task mode (mono3d, mv3d, lidar), separately.
+        load_type (frame_based, mv_image_based, fov_image_based), separately.

         Args:
             box_dict (dict): Box dictionaries to be converted.
scores=np.zeros([0]), scores=np.zeros([0]),
label_preds=np.zeros([0, 4]), label_preds=np.zeros([0, 4]),
sample_idx=sample_idx) sample_idx=sample_idx)
# Here default used 'CAM2' to compute metric. If you want to # Here default used 'CAM_FRONT' to compute metric. If you want to
# use another camera, please modify it. # use another camera, please modify it.
if self.task in ['mv3d_det', 'lidar_det']: if self.load_type in ['frame_based', 'fov_image_based']:
cam_key = self.default_cam_key cam_key = self.default_cam_key
elif self.task == 'mono_det': elif self.load_type == 'mv_image_based':
cam_key = list(info['images'].keys())[0] cam_key = list(info['images'].keys())[0]
else: else:
raise NotImplementedError raise NotImplementedError
@@ -661,12 +668,12 @@ class WaymoMetric(KittiMetric):
             (box_2d_preds[:, 1] < image_shape[0]) &
             (box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0))
         # check box_preds_lidar
-        if self.task in ['mv3d_det', 'lidar_det']:
+        if self.load_type in ['frame_based']:
             limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range)
             valid_pcd_inds = ((box_preds_lidar.center > limit_range[:3]) &
                               (box_preds_lidar.center < limit_range[3:]))
             valid_inds = valid_pcd_inds.all(-1)
-        elif self.task == 'mono_det':
+        if self.load_type in ['mv_image_based', 'fov_image_based']:
             valid_inds = valid_cam_inds
         if valid_inds.sum() > 0:
...
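Note on the convert_to fix above: the conversion expects the lidar-to-camera matrix itself, so wrapping it in np.linalg.inv applied the transform in the wrong direction. A sketch of the corrected call, with cam0_info and box_preds_lidar taken from the method's scope (imports assumed from mmdet3d dev-1.x):

import numpy as np
from mmdet3d.structures import Box3DMode  # assumed import path

lidar2cam = np.array(
    cam0_info['images']['CAM_FRONT']['lidar2img'], dtype=np.float32)
# Pass the lidar->camera matrix directly; inverting it would map
# camera coordinates back to lidar instead.
box_preds_camera = box_preds_lidar.convert_to(
    Box3DMode.CAM, lidar2cam, correct_yaw=True)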
@@ -133,6 +133,7 @@ class Waymo2KITTI(object):
             self.save_image(frame, file_idx, frame_idx)
             self.save_calib(frame, file_idx, frame_idx)
             if 'testing_3d_camera_only_detection' not in self.load_dir:
+                # the camera only split doesn't contain lidar points.
                 self.save_lidar(frame, file_idx, frame_idx)
             self.save_pose(frame, file_idx, frame_idx)
             self.save_timestamp(frame, file_idx, frame_idx)
...